def __init__(self, args_snp):
    """Set up helper objects and the output/working paths for SNP calling."""
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    out = args_snp.out_folder
    # Reference comparison and target validation write into separate trees.
    file_type = ("compare_reference" if args_snp.types == "reference"
                 else "validate_target")
    self.seq_path = os.path.join(out, file_type, "seqs")
    self.stat_path = os.path.join(out, file_type, "statistics")
    self.fasta_path = os.path.join(args_snp.fastas, "tmp")
    self.outputs = {
        "table": os.path.join(out, file_type, "SNP_table"),
        "raw": os.path.join(out, file_type, "SNP_raw_outputs"),
        "tmp": os.path.join(out, "tmp_bcf"),
        "depth": os.path.join(out, "tmp_depth")}
    # Drop stale merged BAMs from a previous run before regenerating them.
    if "whole_reads.bam" in os.listdir(out):
        self.helper.remove_all_content(out, "whole_read", "file")
    self.bams = {
        "whole": os.path.join(out, "whole_reads.bam"),
        "sort": os.path.join(out, "whole_reads_sorted.bam"),
        "bams": []}
    self.header = os.path.join(out, "header")
    self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                 "extend": "extend_BAQ"}
def __init__(self, args_circ):
    """Initialize helpers and the folder layout for circRNA detection.

    Exits with an explicit error message when no genome fasta folder is
    given: the fasta sequences are required both for alignment and for
    parsing pre-aligned results.
    """
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    out = args_circ.output_folder
    self.alignment_path = os.path.join(out, "segemehl_align")
    self.splice_path = os.path.join(out, "segemehl_splice")
    self.candidate_path = os.path.join(out, "circRNA_tables")
    self.gff_folder = os.path.join(out, "gffs")
    self.gff_path = os.path.join(args_circ.gffs, "tmp")
    self.splices = {"all_file": "splicesites_all.bed",
                    "file": "splicesites.bed",
                    "all": "splicesites_all", "splice": "splicesites"}
    self.trans = {"all_file": "transrealigned_all.bed",
                  "file": "transrealigned.bed",
                  "all": "transrealigned_all", "trans": "transrealigned"}
    self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
    # Bug fix: the fasta folder is needed whether or not alignment runs;
    # previously the non-align branch crashed with an opaque TypeError
    # (os.path.join(None, "tmp")) instead of printing this message.
    if args_circ.fastas is None:
        print("Error: There is no genome fasta file!!!")
        sys.exit()
    self.fasta_path = os.path.join(args_circ.fastas, "tmp")
def __init__(self, args_ribo):
    """Resolve input paths and per-program output folders.

    Riboswitch and/or RNA-thermometer folders are only created for the
    program(s) actually requested via ``args_ribo.program``.
    """
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_ribo.gffs, "tmp")
    self.tss_path = (os.path.join(args_ribo.tsss, "tmp")
                     if args_ribo.tsss is not None else None)
    self.tran_path = os.path.join(args_ribo.trans, "tmp")
    self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
    if args_ribo.program in ("both", "riboswitch"):
        (self.ribos_stat_folder, self.ribos_gff_outfolder,
         self.ribos_table_folder, self.ribos_scan_folder,
         self.ribos_tmp_files, self.ribos_rfam,
         self.ribos_suffixs) = self._create_out_folders(
             args_ribo.ribos_out_folder, "riboswitch", args_ribo.database)
    if args_ribo.program in ("both", "thermometer"):
        (self.thermo_stat_folder, self.thermo_gff_outfolder,
         self.thermo_table_folder, self.thermo_scan_folder,
         self.thermo_tmp_files, self.thermo_rfam,
         self.thermo_suffixs) = self._create_out_folders(
             args_ribo.thermo_out_folder, "RNA_thermometer",
             args_ribo.database)
def optimize_tss(args_ops):
    """Optimize TSSpredator parameters for the strain named in *args_ops*.

    Validates that the gff/fasta/wiggle input folders are non-empty,
    locates the gff and fasta files of ``args_ops.project_strain``, runs
    the optimization, then removes all temporary files and folders.
    """
    if len(os.listdir(args_ops.gffs)) == 0:
        print("Error: There is no gff files!!!")
        sys.exit()
    if len(os.listdir(args_ops.fastas)) == 0:
        print("Error: There is no fasta files!!!")
        sys.exit()
    if len(os.listdir(args_ops.wigs)) == 0:
        print("Error: There is no wiggle files!!!")
        sys.exit()
    # Construct one Multiparser/Helper instead of a fresh object per call.
    multiparser = Multiparser()
    helper = Helper()
    multiparser.parser_wig(args_ops.wigs)
    multiparser.parser_gff(args_ops.gffs, None)
    multiparser.parser_fasta(args_ops.fastas)
    gff_path = os.path.join(args_ops.gffs, "tmp")
    wig_path = os.path.join(args_ops.wigs, "tmp")
    fasta_path = os.path.join(args_ops.fastas, "tmp")
    gff_file = None
    for gff in os.listdir(gff_path):
        if args_ops.project_strain in gff:
            gff_file = os.path.join(gff_path, gff)
            break
    fasta_file = None
    for fa in os.listdir(fasta_path):
        if args_ops.project_strain in fa:
            fasta_file = os.path.join(fasta_path, fa)
            break
    # Bug fix: a strain with no matching gff/fasta file previously caused
    # an unhandled NameError instead of a clear error message.
    if (gff_file is None) or (fasta_file is None):
        print("Error: There is no gff or fasta file of {0}!!!".format(
            args_ops.project_strain))
        sys.exit()
    helper.check_uni_attributes(gff_file)
    optimization(wig_path, fasta_file, gff_file, args_ops)
    helper.remove_all_content(
        os.path.join(args_ops.output_folder, "optimized_TSSpredator"),
        "config", "file")
    helper.remove_all_content(
        os.path.join(args_ops.output_folder, "optimized_TSSpredator"),
        "Master", "dir")
    helper.remove_tmp_dir(args_ops.wigs)
    helper.remove_tmp_dir(args_ops.gffs)
    helper.remove_tmp_dir(args_ops.fastas)
def __init__(self, args_ribo):
    """Store helper objects and all riboswitch-related paths."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.gff_parser = Gff3Parser()
    # "tmp" sub-folders are populated by the multiparser for every input.
    self.gff_path = os.path.join(args_ribo.gffs, "tmp")
    self.tss_path = os.path.join(args_ribo.tsss, "tmp")
    self.tran_path = os.path.join(args_ribo.trans, "tmp")
    self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
    out = args_ribo.out_folder
    self.stat_folder = os.path.join(out, "statistics")
    self.gff_outfolder = os.path.join(out, "gffs")
    self.table_folder = os.path.join(out, "tables")
    self.scan_folder = os.path.join(out, "scan_Rfam")
    self.ribos_rfam = os.path.join(args_ribo.database, "Rfam_riboswitch.cm")
    self.tmp_files = {name: os.path.join(out, "tmp_" + name)
                      for name in ("fasta", "scan", "table")}
    self.suffixs = {"csv": "riboswitch.csv",
                    "txt": "riboswitch_prescan.txt",
                    "re_txt": "riboswitch_scan.txt",
                    "re_csv": "riboswitch_scan.csv"}
def __init__(self, tar_folder, ref_folder):
    """Keep the helper objects and the tmp sub-folder of each fasta folder."""
    self.helper = Helper()
    self.seq_editer = SeqEditer()
    self.multiparser = Multiparser()
    self.folders = {
        "tmp_tar": os.path.join(tar_folder, "tmp"),
        "tmp_ref": os.path.join(ref_folder, "tmp")}
def __init__(self, args_utr):
    """Resolve the TSS/transcript inputs and the 5'/3' UTR output paths."""
    self.helper = Helper()
    self.multiparser = Multiparser()
    self.tss_path = os.path.join(args_utr.tsss, "tmp")
    self.tran_path = os.path.join(args_utr.trans, "tmp")
    utr5 = os.path.join(args_utr.out_folder, "5UTR")
    utr3 = os.path.join(args_utr.out_folder, "3UTR")
    self.utr5_path = utr5
    self.utr3_path = utr3
    # Each UTR class keeps its statistics in a sub-folder of its own tree.
    self.utr5_stat_path = os.path.join(utr5, "statistics")
    self.utr3_stat_path = os.path.join(utr3, "statistics")
def setUp(self):
    """Create the reference/target working folders used by the tests."""
    self.multiparser = Multiparser()
    self.example = Example()
    self.ref_folder = "ref_folder"
    self.tar_folder = "tar_folder"
    for folder in (self.ref_folder, self.tar_folder):
        if not os.path.exists(folder):
            os.mkdir(folder)
class TargetFasta(object):
    '''detection of sRNA target interaction'''

    def __init__(self, tar_folder, ref_folder):
        # Helper objects; "tmp_ref" is added later by gen_folder().
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp")}

    def gen_folder(self, out_folder, ref_files):
        """Copy reference files into a fresh tmp_reference folder, split
        them per strain, and prepare an empty target tmp folder.

        Returns the path of the new reference folder.
        """
        new_ref_folder = os.path.join(out_folder, "tmp_reference")
        self.helper.check_make_folder(new_ref_folder)
        for file_ in ref_files:
            shutil.copy(file_, new_ref_folder)
        self.folders["tmp_ref"] = os.path.join(new_ref_folder, "tmp")
        self.multiparser.parser_fasta(new_ref_folder)
        # Recreate the target tmp folder from scratch on every run.
        if "tmp_tar" in os.listdir(out_folder):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        return new_ref_folder

    def get_target_fasta(self, mut_table, tar_folder, ref_files, output,
                         out_folder):
        """Apply the mutation table to the reference sequences and write
        the requested multi-fasta output files, then clean up all
        temporary folders.
        """
        new_ref_folder = self.gen_folder(out_folder, ref_files)
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("Transfering to target fasta")
        for file_ in output:
            first = True
            # Each output spec is "<filename>:<strain1>,<strain2>,...".
            datas = file_.split(":")
            filename = datas[0]
            strains = datas[1].split(",")
            out = open(filename, "w")
            for strain in strains:
                if strain + ".fa" in os.listdir(self.folders["tmp_tar"]):
                    # Separate concatenated records with a blank line.
                    if first:
                        first = False
                    else:
                        out.write("\n")
                    with open(
                            os.path.join(self.folders["tmp_tar"],
                                         strain + ".fa")) as f_h:
                        for line in f_h:
                            out.write(line)
                else:
                    print(
                        "Error: No fasta information of {0}.fa".format(
                            strain))
            out.close()
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        if "tmp_reference" in os.listdir(out_folder):
            shutil.rmtree(new_ref_folder)
        print("Please use the new fasta file to remapping again.")
def __init__(self, args_op):
    """Resolve input tmp paths; the terminator folder is optional."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.tss_path = os.path.join(args_op.tsss, "tmp")
    self.tran_path = os.path.join(args_op.trans, "tmp")
    self.utr5_path = os.path.join(args_op.utr5s, "tmp")
    self.utr3_path = os.path.join(args_op.utr3s, "tmp")
    self.table_path = os.path.join(args_op.output_folder, "tables")
    self.term_path = None
    if args_op.terms is not None:
        # Terminator gff files are validated before their path is used.
        self._check_gff(args_op.terms, "term")
        self.term_path = os.path.join(args_op.terms, "tmp")
class TargetFasta(object):
    """Generate target genome fasta files by applying a mutation table
    to reference fasta sequences."""

    def __init__(self, tar_folder, ref_folder):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # tmp working folders inside the target and reference folders
        self.folders = {
            "tmp_tar": os.path.join(tar_folder, "tmp"),
            "tmp_ref": os.path.join(ref_folder, "tmp")
        }

    def get_target_fasta(self, mut_table, tar_folder, ref_folder, output):
        """Apply the mutation table and write the target fasta files.

        If ``output`` is given, each entry "<name>:<s1>_and_<s2>..."
        produces one combined fasta file; otherwise the per-strain files
        are moved into ``tar_folder`` directly.
        """
        self.multiparser.parser_fasta(ref_folder)
        # Recreate the target tmp folder from scratch on every run.
        if "tmp" in os.listdir(tar_folder):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("transfer to target fasta...")
        if output is not None:
            for file_ in output:
                first = True
                datas = file_.split(":")
                filename = datas[0]
                strains = datas[1].split("_and_")
                out = open(os.path.join(tar_folder, filename + ".fa"), "w")
                for strain in strains:
                    if strain + ".fa" in os.listdir(self.folders["tmp_tar"]):
                        # Separate concatenated records with a blank line.
                        if first:
                            first = False
                        else:
                            out.write("\n")
                        with open(
                                os.path.join(self.folders["tmp_tar"],
                                             strain + ".fa")) as f_h:
                            for line in f_h:
                                out.write(line)
                    else:
                        print("Error:no fasta information of {0}.fa".format(
                            strain))
                out.close()
        else:
            self.helper.move_all_content(self.folders["tmp_tar"],
                                         tar_folder, [".fa"])
        # Clean up every temporary folder created above.
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        self.helper.remove_all_content(ref_folder, "_folder", "dir")
        print("please use the new fasta file to remapping again.")
        print("Then copy BAMs and wigs back to input/align_results/BAMs "
              "and input/align_results/wigs")
def __init__(self, args_term):
    """Build every input, output and temporary path used by terminator
    detection, then create the gff output folders."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_term.gffs, "tmp")
    self.fasta_path = os.path.join(args_term.fastas, "tmp")
    self.tran_path = os.path.join(args_term.trans, "tmp")
    self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                      "csv": os.path.join(args_term.out_folder, "tables")}
    # The same four candidate classes exist as gff and as table output.
    classes = {"all": "all_candidates", "express": "expressed_candidates",
               "best": "best_candidates", "non": "non_expressed_candidates"}
    self.terms = {key: os.path.join(self.outfolder["term"], sub)
                  for key, sub in classes.items()}
    self.csvs = {key: os.path.join(self.outfolder["csv"], sub)
                 for key, sub in classes.items()}
    self.combine_path = os.path.join(self.gff_path, "combine")
    cwd = os.getcwd()
    self.tmps = {"transterm": os.path.join(cwd, "tmp_transterm"),
                 "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                 "hp_path": "tmp_transterm/tmp",
                 "term_table": os.path.join(cwd, "tmp_term_table"),
                 "merge": os.path.join(cwd, "tmp_merge_gff"),
                 "gff": "tmp.gff",
                 "folder": os.path.join(cwd, "tmp")}
    self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                    "allgff": "term_all.gff"}
    self.srna_path = (os.path.join(args_term.srnas, "tmp")
                      if args_term.srnas else None)
    self._make_gff_folder()
def __init__(self, args_circ):
    """Resolve the folder layout for circRNA detection output."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    out = args_circ.output_folder
    self.alignment_path = os.path.join(out, "segemehl_alignment_files")
    self.splice_path = os.path.join(out, "segemehl_splice_results")
    self.candidate_path = os.path.join(out, "circRNA_tables")
    self.gff_folder = os.path.join(out, "gffs")
    self.gff_path = os.path.join(args_circ.gffs, "tmp")
    self.fasta_path = os.path.join(args_circ.fastas, "tmp")
    # Segemehl output filenames and their prefixes.
    self.splices = {"file": "splicesites.bed", "splice": "splicesites"}
    self.trans = {"file": "transrealigned.bed", "trans": "transrealigned"}
class TargetFasta(object):
    """Generate target genome fasta files by applying a mutation table
    to reference fasta sequences."""

    def __init__(self, tar_folder, ref_folder):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # tmp working folders inside the target and reference folders
        self.folders = {"tmp_tar": os.path.join(tar_folder, "tmp"),
                        "tmp_ref": os.path.join(ref_folder, "tmp")}

    def get_target_fasta(self, mut_table, tar_folder, ref_folder, output):
        """Apply the mutation table and write the target fasta files.

        If ``output`` is given, each entry "<name>:<s1>_and_<s2>..."
        produces one combined fasta file; otherwise the per-strain files
        are moved into ``tar_folder`` directly.
        """
        self.multiparser.parser_fasta(ref_folder)
        # Recreate the target tmp folder from scratch on every run.
        if "tmp" in os.listdir(tar_folder):
            shutil.rmtree(self.folders["tmp_tar"])
        os.mkdir(self.folders["tmp_tar"])
        self.seq_editer.modify_seq(self.folders["tmp_ref"], mut_table,
                                   self.folders["tmp_tar"])
        print("transfer to target fasta...")
        if output is not None:
            for file_ in output:
                first = True
                datas = file_.split(":")
                filename = datas[0]
                strains = datas[1].split("_and_")
                out = open(os.path.join(tar_folder, filename + ".fa"), "w")
                for strain in strains:
                    if strain + ".fa" in os.listdir(self.folders["tmp_tar"]):
                        # Separate concatenated records with a blank line.
                        if first:
                            first = False
                        else:
                            out.write("\n")
                        with open(os.path.join(
                                self.folders["tmp_tar"],
                                strain + ".fa")) as f_h:
                            for line in f_h:
                                out.write(line)
                    else:
                        print("Error:no fasta information of {0}.fa".format(
                            strain))
                out.close()
        else:
            self.helper.move_all_content(self.folders["tmp_tar"],
                                         tar_folder, [".fa"])
        # Clean up every temporary folder created above.
        shutil.rmtree(self.folders["tmp_tar"])
        shutil.rmtree(self.folders["tmp_ref"])
        self.helper.remove_all_content(ref_folder, "_folder", "dir")
        print("please use the new fasta file to remapping again.")
        print("Then copy BAMs and wigs back to input/align_results/BAMs "
              "and input/align_results/wigs")
def __init__(self, args_circ):
    """Initialize helpers and the folder layout for circRNA detection.

    Exits with an explicit error message when no genome fasta folder is
    given: the fasta sequences are required both for alignment and for
    parsing pre-aligned results.
    """
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    out = args_circ.output_folder
    self.alignment_path = os.path.join(out, "segemehl_align")
    self.splice_path = os.path.join(out, "segemehl_splice")
    self.candidate_path = os.path.join(out, "circRNA_tables")
    self.gff_folder = os.path.join(out, "gffs")
    self.gff_path = os.path.join(args_circ.gffs, "tmp")
    self.splices = {"all_file": "splicesites_all.bed",
                    "file": "splicesites.bed",
                    "all": "splicesites_all", "splice": "splicesites"}
    self.trans = {"all_file": "transrealigned_all.bed",
                  "file": "transrealigned.bed",
                  "all": "transrealigned_all", "trans": "transrealigned"}
    self.bams = {"whole": "whole_reads.bam", "sort": "whole_reads_sort"}
    # Bug fix: the fasta folder is needed whether or not alignment runs;
    # previously the non-align branch crashed with an opaque TypeError
    # (os.path.join(None, "tmp")) instead of printing this message.
    if args_circ.fastas is None:
        print("Error: There is no genome fasta file!!!")
        sys.exit()
    self.fasta_path = os.path.join(args_circ.fastas, "tmp")
def __init__(self, args_cris):
    """Create the CRISPR output folder tree and remember its paths."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_cris.gffs, "tmp")
    self.fasta_path = os.path.join(args_cris.fastas, "tmp")
    out = args_cris.out_folder
    self.stat_folder = os.path.join(out, "statistics")
    self.gff_out = os.path.join(out, "gffs")
    self.all_out = os.path.join(out, "gffs", "all_candidates")
    self.best_out = os.path.join(out, "gffs", "best")
    self.data_folder = os.path.join(out, "CRT_output")
    # Make sure every output folder exists before the pipeline runs.
    for folder in (self.all_out, self.best_out,
                   self.data_folder, self.stat_folder):
        self.helper.check_make_folder(folder)
def __init__(self, args_snp):
    """Set up helper objects and output paths for SNP calling."""
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    out = args_snp.out_folder
    # Reference comparison and target validation write into separate trees.
    file_type = ("compare_reference" if args_snp.types == "reference"
                 else "validate_target")
    self.seq_path = os.path.join(out, file_type, "seqs")
    self.stat_path = os.path.join(out, file_type, "statistics")
    self.fasta_path = os.path.join(args_snp.fastas, "tmp")
    self.outputs = {
        "table": os.path.join(out, file_type, "SNP_table"),
        "raw": os.path.join(out, file_type, "SNP_raw_outputs"),
        "tmp": os.path.join(out, "tmp_bcf")}
    # Drop stale merged BAMs from a previous run before regenerating them.
    if "whole_reads.bam" in os.listdir(out):
        self.helper.remove_all_content(out, "whole_read", "file")
    self.bams = {"whole": os.path.join(out, "whole_reads.bam"),
                 "sort": os.path.join(out, "whole_reads_sorted.bam")}
    self.header = os.path.join(out, "header")
    self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                 "extend": "extend_BAQ"}
def __init__(self, args_snp):
    """Prepare the folder layout for SNP/mutation calling output."""
    self.multiparser = Multiparser()
    self.seq_editer = SeqEditer()
    self.helper = Helper()
    out = args_snp.out_folder
    file_type = ("compare_related_and_reference_genomes"
                 if args_snp.types == "related_genome"
                 else "mutations_of_reference_genomes")
    self.seq_path = os.path.join(out, file_type, "seqs")
    self.stat_path = os.path.join(out, file_type, "statistics")
    self.fig_path = os.path.join(self.stat_path, "figs")
    self.helper.check_make_folder(self.fig_path)
    self.outputs = {
        "table": os.path.join(out, file_type, "SNP_tables"),
        "raw": os.path.join(out, file_type, "SNP_raw_outputs"),
        "tmp": os.path.join(out, "tmp_bcf"),
        "depth": os.path.join(out, "tmp_depth")}
    self.bams = {"whole": os.path.join(out, "whole_reads.bam"),
                 "sort": os.path.join(out, "whole_reads_sorted.bam"),
                 "bams": []}
    self.header = os.path.join(out, "header")
    self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                 "extend": "extend_BAQ"}
def __init__(self, args_ribo):
    """Resolve inputs and create per-program output folders as needed."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_ribo.gffs, "tmp")
    if args_ribo.tsss is None:
        self.tss_path = None
    else:
        self.tss_path = os.path.join(args_ribo.tsss, "tmp")
    self.tran_path = os.path.join(args_ribo.trans, "tmp")
    self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
    program = args_ribo.program
    if program in ("both", "riboswitch"):
        (self.ribos_stat_folder, self.ribos_gff_outfolder,
         self.ribos_table_folder, self.ribos_scan_folder,
         self.ribos_tmp_files, self.ribos_rfam,
         self.ribos_suffixs) = self._create_out_folders(
             args_ribo.ribos_out_folder, "riboswitch", args_ribo.database)
    if program in ("both", "thermometer"):
        (self.thermo_stat_folder, self.thermo_gff_outfolder,
         self.thermo_table_folder, self.thermo_scan_folder,
         self.thermo_tmp_files, self.thermo_rfam,
         self.thermo_suffixs) = self._create_out_folders(
             args_ribo.thermo_out_folder, "RNA_thermometer",
             args_ribo.database)
def __init__(self, args_go):
    """Paths for GO-term annotation of all vs. expressed CDS."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.out_all = os.path.join(args_go.out_folder, "all_CDS")
    self.out_express = os.path.join(args_go.out_folder, "expressed_CDS")
    # Result and statistics folders mirror each other in both trees.
    self.result_all_path = os.path.join(self.out_all, "Go_term_results")
    self.result_express_path = os.path.join(self.out_express,
                                            "Go_term_results")
    self.stat_all_path = os.path.join(self.out_all, "statistics")
    self.stat_express_path = os.path.join(self.out_express, "statistics")
    self.gff_path = os.path.join(args_go.gffs, "tmp")
    self.tran_path = (None if args_go.trans is None
                      else os.path.join(args_go.trans, "tmp"))
    self.all_strain = "all_strains_uniprot.csv"
def __init__(self, args_sorf):
    """Resolve optional TSS/sRNA inputs and the sORF output folders."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    # TSS and sRNA inputs are optional; None marks them as absent.
    self.tss_path = (os.path.join(args_sorf.tsss, "tmp")
                     if args_sorf.tsss is not None else None)
    self.srna_path = (os.path.join(args_sorf.srnas, "tmp")
                      if args_sorf.srnas is not None else None)
    self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
    self.table_output = os.path.join(args_sorf.out_folder, "tables")
    self.tran_path = os.path.join(args_sorf.trans, "tmp")
    self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
    self.all_cand = "all_candidates"
    self.best = "best"
def __init__(self, args_tar):
    """Folder layout for sRNA target prediction (RNAplex/RNAup/IntaRNA)."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.fixer = FormatFixer()
    self.gff_parser = Gff3Parser()
    out = args_tar.out_folder
    self.target_seq_path = os.path.join(out, "target_seqs")
    self.srna_seq_path = os.path.join(out, "sRNA_seqs")
    self.rnaplex_path = os.path.join(out, "RNAplex_results")
    self.rnaup_path = os.path.join(out, "RNAup_results")
    self.intarna_path = os.path.join(out, "IntaRNA_results")
    self.merge_path = os.path.join(out, "merged_results")
    self.srna_path = os.path.join(args_tar.srnas, "tmp")
    self.fasta_path = os.path.join(args_tar.fastas, "tmp")
    self.gff_path = os.path.join(args_tar.gffs, "tmp")
    # Temporary file names/patterns used during prediction.
    self.tmps = {"tmp": "tmp_srna_target", "rnaup": "tmp_rnaup",
                 "log": "tmp_log",
                 "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}
def __init__(self, args_ratt):
    """Paths for running RATT annotation transfer."""
    self.multiparser = Multiparser()
    self.converter = Converter()
    self.format_fixer = FormatFixer()
    self.helper = Helper()
    self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
    self.gbk_tmp = os.path.join(self.gbk, "tmp")
    self.embl = os.path.join(args_ratt.ref_embls, "embls")
    self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
    gff_out = args_ratt.gff_outfolder
    self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                      "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                      "out_gff": os.path.join(gff_out, "tmp"),
                      "gff": os.path.join(gff_out, "tmp.gff"),
                      "ptt": os.path.join(gff_out, "tmp.ptt"),
                      "rnt": os.path.join(gff_out, "tmp.rnt")}
def __init__(self, args_tss):
    """Resolve the paths used by TSS prediction.

    ``manual_path`` is always assigned (None when no manual TSS folder is
    given) so later code can test it instead of hitting AttributeError.
    """
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.master = os.path.join(args_tss.out_folder, "MasterTables")
    self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                 "tss_ta": "tmp_tss", "tmp": "tmp"}
    if args_tss.ta_files is not None:
        self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
    else:
        self.tmps["ta"] = None
    self.gff_path = os.path.join(args_tss.gffs, "tmp")
    # Bug fix: manual_path was previously left undefined when
    # args_tss.manual was None (only the if-branch existed), mirroring the
    # tmps["ta"] handling above keeps the attribute always present.
    if args_tss.manual is not None:
        self.manual_path = os.path.join(args_tss.manual, "tmp")
    else:
        self.manual_path = None
    self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
    self.fasta_path = os.path.join(args_tss.fastas, "tmp")
    self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
    self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")
def __init__(self, args_tran):
    """Paths for transcript assembly output and its temporary files."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
    self.tran_path = os.path.join(self.gff_outfolder, "tmp")
    self.stat_path = os.path.join(args_tran.out_folder, "statistics")
    self.tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
                 "tran": os.path.join(args_tran.out_folder, "tmp_tran")}
    # The remaining temporaries all live under the gff output folder and
    # follow the "tmp_<key>" naming scheme.
    for key in ("tss_ta", "ta_tss", "ta_gff", "gff_ta", "uni", "overlap"):
        self.tmps[key] = os.path.join(self.gff_outfolder, "tmp_" + key)
    self.frag = "transcript_fragment.gff"
    self.tex = "transcript_tex_notex.gff"
    self.endfix_tran = "transcript.gff"
def __init__(self, args_srna):
    """Resolve every input, output and temporary path for sRNA detection.

    Fix: ``table_output`` and ``stat_path`` were assigned twice with
    identical values; the redundant later assignments are removed.
    """
    self.args_container = ArgsContainer()
    self.helper = Helper()
    self.multiparser = Multiparser()
    out = args_srna.out_folder
    self.gff_output = os.path.join(out, "gffs")
    self.table_output = os.path.join(out, "tables")
    self.stat_path = os.path.join(out, "statistics")
    # Optional inputs resolve to None when the folder is missing.
    self.tss_path = self._check_folder_exist(args_srna.tss_folder)
    self.pro_path = self._check_folder_exist(args_srna.pro_folder)
    self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
    self.fasta_path = os.path.join(args_srna.fastas, "tmp")
    self.tran_path = os.path.join(args_srna.trans, "tmp")
    self.term_path = self._check_folder_exist(args_srna.terms)
    self.merge_wigs = os.path.join(out, "merge_wigs")
    self.prefixs = {
        "merge": os.path.join(out, "tmp_merge"),
        "utr": os.path.join(out, "tmp_utrsrna"),
        "normal": os.path.join(out, "tmp_normal"),
        "in_cds": os.path.join(out, "tmp_incds"),
        "merge_table": os.path.join(out, "tmp_merge_table"),
        "utr_table": os.path.join(out, "tmp_utrsrna_table"),
        "normal_table": os.path.join(out, "tmp_normal_table"),
        "in_cds_table": os.path.join(out, "tmp_incds_table"),
        "basic": os.path.join(out, "tmp_basic"),
        "energy": os.path.join(out, "tmp_energy")}
    self.tmps = {"nr": os.path.join(out, "tmp_nr"),
                 "srna": os.path.join(out, "tmp_sRNA")}
    self.best_table = os.path.join(self.table_output, "best")
    self.all_best = {
        "all_gff": os.path.join(self.gff_output, "all_candidates"),
        "best_gff": os.path.join(self.gff_output, "best"),
        "all_table": os.path.join(self.table_output, "all_candidates"),
        "best_table": os.path.join(self.table_output, "best")}
def __init__(self, args_pro):
    """Paths and per-class fasta filenames for promoter analysis."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.tss_path = os.path.join(args_pro.tsss, "tmp")
    self.gff_path = (os.path.join(args_pro.gffs, "tmp")
                     if args_pro.gffs is not None else None)
    self.out_fasta = os.path.join(args_pro.output_folder, "fasta_classes")
    self.tmp_folder = os.path.join(os.getcwd(), "tmp")
    tmp = self.tmp_folder
    # One fasta per TSS class; "all"/"all_no_orph" are bare filenames.
    self.fastas = {"pri": os.path.join(tmp, "primary.fa"),
                   "sec": os.path.join(tmp, "secondary.fa"),
                   "inter": os.path.join(tmp, "internal.fa"),
                   "anti": os.path.join(tmp, "antisense.fa"),
                   "orph": os.path.join(tmp, "orphan.fa"),
                   "all_no_orph": "without_orphan.fa",
                   "all": "all_type.fa",
                   "tmp_fa": os.path.join(tmp, "tmp.fa"),
                   "tmp_all": os.path.join(tmp, "tmp_all.fa")}
    self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa")
    self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff")
def __init__(self, args_sub):
    """Paths for subcellular localization (PSORTb) analysis."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.fixer = FormatFixer()
    self.gff_path = os.path.join(args_sub.gffs, "tmp")
    self.fasta_path = os.path.join(args_sub.fastas, "tmp")
    self.tran_path = (os.path.join(args_sub.trans, "tmp")
                      if args_sub.trans is not None else None)
    self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
    self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
    # Parallel sub-folders under the "all" and "expressed" output trees.
    self.all_tmp_path = os.path.join(self.out_all, "tmp")
    self.express_tmp_path = os.path.join(self.out_express, "tmp")
    self.all_stat_path = os.path.join(self.out_all, "statistics")
    self.express_stat_path = os.path.join(self.out_express, "statistics")
    self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
    self.express_tmp_result = os.path.join(self.out_express, "tmp_results")
    self.all_result = os.path.join(self.out_all, "psortb_results")
    self.express_result = os.path.join(self.out_express, "psortb_results")
    self.endfix_table = "table.csv"
    self.endfix_raw = "raw.txt"
    self._make_folder()
def __init__(self, gffs):
    """Recreate the for_libs output tree under the gff folder."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.out_folder = os.path.join(gffs, "for_libs")
    # Always start from a clean output folder.
    if os.path.exists(self.out_folder):
        shutil.rmtree(self.out_folder)
    os.mkdir(self.out_folder)
    self.stat = os.path.join(self.out_folder, "statistics")
    self.gff_folder = os.path.join(self.out_folder, "gffs")
    os.mkdir(self.stat)
    os.mkdir(self.gff_folder)
    self.merge_wigs = os.path.join(gffs, "merge_wigs")
    if os.path.exists(self.merge_wigs):
        shutil.rmtree(self.merge_wigs)
def __init__(self, args_tar):
    """Folder layout for sRNA target prediction (RNAplex/RNAup)."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.fixer = FormatFixer()
    self.gff_parser = Gff3Parser()
    out = args_tar.out_folder
    self.target_seq_path = os.path.join(out, "target_seqs")
    self.srna_seq_path = os.path.join(out, "sRNA_seqs")
    self.rnaplex_path = os.path.join(out, "RNAplex")
    self.rnaup_path = os.path.join(out, "RNAup")
    self.merge_path = os.path.join(out, "merge")
    self.srna_path = os.path.join(args_tar.srnas, "tmp")
    self.fasta_path = os.path.join(args_tar.fastas, "tmp")
    self.gff_path = os.path.join(args_tar.gffs, "tmp")
    # Temporary file names/patterns used during prediction.
    self.tmps = {"tmp": "tmp", "rnaup": "tmp_rnaup", "log": "tmp_log",
                 "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}
def __init__(self, args_tss):
    """Resolve the paths used by TSS prediction."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    out = args_tss.out_folder
    self.master = os.path.join(out, "MasterTables")
    self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                 "tss_ta": "tmp_tss", "tmp": "tmp"}
    # Transcript input is optional; None marks it as absent.
    self.tmps["ta"] = (os.path.join(args_tss.ta_files, "tmp")
                       if args_tss.ta_files is not None else None)
    self.gff_path = os.path.join(args_tss.gffs, "tmp")
    self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
    self.fasta_path = os.path.join(args_tss.fastas, "tmp")
    self.stat_outfolder = os.path.join(out, "statistics")
    self.gff_outfolder = os.path.join(out, "gffs")
def __init__(self, args_sorf):
    """Resolve optional TSS/sRNA inputs and the sORF output folders."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    # TSS and sRNA inputs are optional; None marks them as absent.
    self.tss_path = (os.path.join(args_sorf.tsss, "tmp")
                     if args_sorf.tsss is not None else None)
    self.srna_path = (os.path.join(args_sorf.srnas, "tmp")
                      if args_sorf.srnas is not None else None)
    self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
    self.table_output = os.path.join(args_sorf.out_folder, "tables")
    self.tran_path = os.path.join(args_sorf.trans, "tmp")
    self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
    self.all_cand = "all_candidates"
    self.best = "best_candidates"
def __init__(self, args_go):
    """Paths for GO-term annotation of all vs. expressed CDSs."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.out_all = os.path.join(args_go.out_folder, "all_CDSs")
    self.out_express = os.path.join(args_go.out_folder, "expressed_CDSs")
    # Result and statistics folders mirror each other in both trees.
    self.result_all_path = os.path.join(self.out_all, "GO_term_results")
    self.result_express_path = os.path.join(self.out_express,
                                            "GO_term_results")
    self.stat_all_path = os.path.join(self.out_all, "statistics")
    self.stat_express_path = os.path.join(self.out_express, "statistics")
    self.gff_path = os.path.join(args_go.gffs, "tmp")
    self.tran_path = (os.path.join(args_go.trans, "tmp")
                      if args_go.trans is not None else None)
    self.all_strain = "all_genomes_uniprot.csv"
def __init__(self, args_srna):
    """Resolve every input, output and temporary path for sRNA detection.

    Fix: ``table_output`` and ``stat_path`` were assigned twice with
    identical values; the redundant later assignments are removed.
    """
    self.args_container = ArgsContainer()
    self.helper = Helper()
    self.multiparser = Multiparser()
    out = args_srna.out_folder
    self.gff_output = os.path.join(out, "gffs")
    self.table_output = os.path.join(out, "tables")
    self.stat_path = os.path.join(out, "statistics")
    # Optional inputs resolve to None when the folder is missing.
    self.tss_path = self._check_folder_exist(args_srna.tss_folder)
    self.pro_path = self._check_folder_exist(args_srna.pro_folder)
    self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
    self.fasta_path = os.path.join(args_srna.fastas, "tmp")
    self.tran_path = os.path.join(args_srna.trans, "tmp")
    self.term_path = self._check_folder_exist(args_srna.terms)
    self.merge_wigs = os.path.join(out, "merge_wigs")
    self.prefixs = {
        "merge": os.path.join(out, "tmp_merge"),
        "utr": os.path.join(out, "tmp_utrsrna"),
        "normal": os.path.join(out, "tmp_normal"),
        "in_cds": os.path.join(out, "tmp_incds"),
        "merge_table": os.path.join(out, "tmp_merge_table"),
        "utr_table": os.path.join(out, "tmp_utrsrna_table"),
        "normal_table": os.path.join(out, "tmp_normal_table"),
        "in_cds_table": os.path.join(out, "tmp_incds_table"),
        "basic": os.path.join(out, "tmp_basic"),
        "energy": os.path.join(out, "tmp_energy")}
    self.tmps = {"nr": os.path.join(out, "tmp_nr"),
                 "srna": os.path.join(out, "tmp_sRNA")}
    self.best_table = os.path.join(self.table_output, "best")
    self.all_best = {
        "all_gff": os.path.join(self.gff_output, "all_candidates"),
        "best_gff": os.path.join(self.gff_output, "best"),
        "all_table": os.path.join(self.table_output, "all_candidates"),
        "best_table": os.path.join(self.table_output, "best")}
def __init__(self, out_folder):
    """Set up parsers and the result/temporary paths under *out_folder*."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gffparser = Gff3Parser()
    # Result folders.
    self.all_result = os.path.join(out_folder, "all_results")
    self.best_result = os.path.join(out_folder, "best_results")
    self.fig = os.path.join(out_folder, "figures")
    self.tmp_id = os.path.join(out_folder, "tmp_id_list")
    # Sub-folder names used below the result folders.
    self.with_strain = "with_strain"
    self.without_strain = "without_strain"
    # Temporary working files; the last three live inside out_folder.
    self.tmp_files = {"log": "tmp_log",
                      "action": "tmp_action.log",
                      "pubmed": "tmp_pubmed.log",
                      "specific": os.path.join(out_folder, "tmp_specific"),
                      "nospecific": os.path.join(out_folder,
                                                 "tmp_nospecific"),
                      "wget_action": os.path.join(out_folder, "tmp_action")}
def __init__(self, args_ratt):
    """Prepare the working paths for the RATT-based annotation transfer."""
    self.multiparser = Multiparser()
    self.converter = Converter()
    self.format_fixer = FormatFixer()
    self.helper = Helper()
    # Converted GenBank/EMBL reference inputs.
    self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
    self.gbk_tmp = os.path.join(self.gbk, "tmp")
    self.embl = os.path.join(args_ratt.ref_embls, "embls")
    self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
    # Temporary fasta folders plus intermediate annotation files.
    out_gff = args_ratt.gff_outfolder
    self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                      "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                      "out_gff": os.path.join(out_gff, "tmp"),
                      "gff": os.path.join(out_gff, "tmp.gff"),
                      "ptt": os.path.join(out_gff, "tmp.ptt"),
                      "rnt": os.path.join(out_gff, "tmp.rnt")}
def __init__(self, args_sc):
    """Create the screenshot output tree; abort if it already exists."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    out_folder = os.path.join(args_sc.output_folder, "screenshots")
    # Guard clause: refuse to overwrite a previous run.
    if os.path.exists(out_folder):
        print("Error: The {0} already exist!!!".format(out_folder))
        sys.exit()
    os.mkdir(out_folder)
    args_sc.output_folder = out_folder
    # Strain name = fasta file name without its final extension.
    filename = args_sc.fasta.split("/")[-1]
    self.strain = ".".join(filename.split(".")[0:-1])
    strain_dir = os.path.join(args_sc.output_folder, self.strain)
    self.helper.check_make_folder(strain_dir)
    self.forward_file = os.path.join(strain_dir, "forward")
    self.reverse_file = os.path.join(strain_dir, "reverse")
    os.mkdir(self.forward_file)
    os.mkdir(self.reverse_file)
def __init__(self, args_term):
    """Set up input paths, temporary files and the four candidate-category
    output folders for terminator detection."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.converter = Converter()
    self.gff_parser = Gff3Parser()
    self.gff_path = os.path.join(args_term.gffs, "tmp")
    self.fasta_path = os.path.join(args_term.fastas, "tmp")
    self.tran_path = os.path.join(args_term.trans, "tmp")
    self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                      "csv": os.path.join(args_term.out_folder, "tables")}
    # GFF and table outputs share the same four candidate categories.
    categories = {"all": "all_candidates",
                  "express": "expressed_candidates",
                  "best": "best_candidates",
                  "non": "non_expressed_candidates"}
    self.terms = {key: os.path.join(self.outfolder["term"], name)
                  for key, name in categories.items()}
    self.csvs = {key: os.path.join(self.outfolder["csv"], name)
                 for key, name in categories.items()}
    self.combine_path = os.path.join(self.gff_path, "combine")
    cwd = os.getcwd()
    self.tmps = {"transterm": os.path.join(cwd, "tmp_transterm"),
                 "hp": "transtermhp",
                 "hp_gff": "transtermhp.gff",
                 "hp_path": "tmp_transterm/tmp",
                 "term_table": os.path.join(cwd, "tmp_term_table"),
                 "merge": os.path.join(cwd, "tmp_merge_gff"),
                 "gff": "tmp.gff",
                 "folder": os.path.join(cwd, "tmp")}
    self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                    "allgff": "term_all.gff"}
    # sRNA input is optional.
    self.srna_path = (os.path.join(args_term.srnas, "tmp")
                      if args_term.srnas else None)
    self._make_gff_folder()
def __init__(self, args_ribo):
    """Set up input/output paths for riboswitch scanning against Rfam."""
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.gff_parser = Gff3Parser()
    # Parsed (per-genome) input folders.
    self.gff_path = os.path.join(args_ribo.gffs, "tmp")
    self.tss_path = os.path.join(args_ribo.tsss, "tmp")
    self.tran_path = os.path.join(args_ribo.trans, "tmp")
    self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
    # Output folders.
    out = args_ribo.out_folder
    self.stat_folder = os.path.join(out, "statistics")
    self.gff_outfolder = os.path.join(out, "gffs")
    self.table_folder = os.path.join(out, "tables")
    self.scan_folder = os.path.join(out, "scan_Rfam")
    self.ribos_rfam = os.path.join(args_ribo.database,
                                   "Rfam_riboswitch.cm")
    self.tmp_files = {"fasta": os.path.join(out, "tmp_fasta"),
                      "scan": os.path.join(out, "tmp_scan"),
                      "table": os.path.join(out, "tmp_table")}
    self.suffixs = {"csv": "riboswitch.csv",
                    "txt": "riboswitch_prescan.txt",
                    "re_txt": "riboswitch_scan.txt",
                    "re_csv": "riboswitch_scan.csv"}
class TSSpredator(object):
    """Wrapper around the TSSpredator Java tool.

    Generates per-strain configuration files, runs the predictor,
    converts the resulting MasterTables to GFF and computes statistics
    for TSS / processing-site detection.
    """

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        # One "MasterTable_<strain>" sub-folder is created per genome.
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        # Names/paths of temporary files and folders used during the run.
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss",
                     "tss_ta": "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        # Turn one colon-split library description into a dict.
        return {"wig": lib_datas[0], "tex": lib_datas[1],
                "condition": int(lib_datas[2]), "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix,
                   rep_set):
        # Write one config entry per condition/replicate, e.g.
        # "fivePrimePlus_1a = <wig path>"; replicates that have no wig
        # for a condition get an empty value.
        for num_id in range(1, lib_num + 1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                    prefix, cond["condition"], cond["replicate"],
                    os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            # NOTE(review): "cond" here is the leftover variable from the
            # loop above; if a condition id has no library at all this
            # raises NameError — presumably every condition is populated.
            # TODO confirm.
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                        prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path,
                      prefix):
        # Invoke the TSSpredator jar; stdout/stderr go to files in
        # the strain's MasterTable folder.
        print("Running TSSpredator for " + prefix)
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        call(["java", "-jar", tsspredator_path, config_file],
             stdout=out, stderr=err)
        out.close()
        err.close()

    def _import_lib(self, libs, wig_folder, project_strain_name, out, gff,
                    program, fasta):
        """Parse library strings ("wig:tex/notex:condition:replicate:strand"),
        match each wig to the current strain and write the annotation,
        five-prime and genome entries of the config file."""
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        print("Runniun {0} now...".format(program))
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error:Exist a not proper wig files!!")
                sys.exit()
            # Replace the generic wig name by the strain-specific file
            # ("<lib>_STRAIN_<strain>.wig") found in the wig folder.
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            # f* = TEX-treated (+/-), n* = not treated (+/-).
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num + 1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            # TSS mode: TEX-treated libs are the five-prime data.
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        elif program.lower() == "processing_site":
            # Processing-site mode: the notex libs are used instead.
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name!!!")
            sys.exit()
        for num_id in range(1, lib_num + 1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num + 1):
            list_num_id.append(str(num_id))
        # num_id is the last loop value (== lib_num when lib_num >= 1).
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        # Either one global value ("all_<n>") or per-library values
        # ("<lib>_<n>,..."): then the most common value becomes the global
        # default and deviating libraries are listed individually.
        if "all" in args_tss.repmatch:
            match = args_tss.repmatch.split("_")[-1]
            out.write("minNumRepMatches = {0}\n".format(match))
        else:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch.split(","):
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff, wig_folder,
                    fasta, config_file):
        '''generation of config files'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
            args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
            args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
            args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
            args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
            args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        # "normal" libraries are the complement of the five-prime ones
        # chosen in _import_lib.
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out, wig_folder,
                            "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out, wig_folder,
                            "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out, wig_folder,
                            "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss):
        # Convert every strain's MasterTable.tsv into a GFF file.
        for prefix in prefixs:
            out_file = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master,
                                    "_".join(["MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error:there is not MasterTable file in {0}".format(
                    out_path))
                print("Please check configuration file.")
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual
        detected TSS and TSSpredator predicted TSS'''
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tss"]))
        for tss in tsss:
            # Pick the annotation file matching this strain.
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            print("Running merge and classify manual ....")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(
                tss)
            merge_manual_predict_tss(
                predict, stat_file,
                os.path.join(self.tmps["tss"], filename),
                os.path.join(args_tss.gffs, gff), args_tss)
            shutil.move(stat_file, os.path.join(
                args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss):
        '''validate TSS with genome annotation'''
        print("Running validation of annotation....")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(
                    self.gff_outfolder, "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            # The validated annotation replaces the original one.
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss):
        '''compare TSS with transcript'''
        detect = False
        print("Running compare transcript assembly and TSS ...")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                self.stat_outfolder, tss,
                "".join(["stat_compare_TSS_transcript_", tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta),
                            compare_file, stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                # Both inputs get re-sorted versions written back.
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False

    def _stat_tss(self, tsss, feature):
        print("Running statistaics.....")
        for tss in tsss:
            compare_file = os.path.join(
                self.gff_outfolder, "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(
                    self.stat_outfolder, tss,
                    "_".join(["stat", feature, "class", tss]) + ".csv"),
                os.path.join(
                    self.stat_outfolder, tss,
                    "_".join(["stat", feature, "libs", tss]) + ".csv"))
            # Plots are written to the cwd and moved afterwards.
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder, tss,
                                 "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(
                os.getcwd(), os.path.join(self.stat_outfolder, tss),
                ["_venn", ".png"])

    def _set_gen_config(self, args_tss, input_folder):
        # Pair up fasta/gff/wig files per strain and generate one config
        # file for each strain that has wig coverage.
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                            input_folder,
                            "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff),
                            self.wig_path,
                            os.path.join(self.fasta_path, fasta), config)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        # Concatenate all forward/reverse wigs of the strain into
        # tmp/merge_forward.wig and tmp/merge_reverse.wig.
        self.helper.check_make_folder(
            os.path.join(os.getcwd(), self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (os.path.isfile(
                            os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                        os.path.join(wig_folder, wig_file),
                        os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(
                self.tmps["tmp"],
                "_".join([prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(
                self.gff_outfolder,
                "_".join([prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss,
                         os.path.join(args_tss.gffs, prefix + ".gff"),
                         "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                         tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Remove temperary files and folders...")
        self.helper.remove_tmp(args_tss.fastas)
        self.helper.remove_tmp(args_tss.gffs)
        self.helper.remove_tmp(args_tss.wig_folder)
        self.helper.remove_tmp(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and processing site
        at the same position'''
        if args_tss.overlap_feature.lower() == "both":
            # Keep both features; nothing to filter.
            pass
        else:
            print("Comparing TSS and Processing site...")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_processing.gff",
                            tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing_site":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                            args_tss.references, "_TSS.gff",
                            tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.overlap_feature,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix,
                    "_".join(["stat", prefix,
                              "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["strain", "cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                    os.path.join(gff_folder, gff), args_tss,
                    "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                    "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                # Replace the GFF by the filtered version.
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
                out.close()

    def run_tsspredator(self, args_tss):
        """Top-level driver: prepare inputs, run TSSpredator per strain,
        convert results and run the optional post-processing steps."""
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder)
        for prefix in prefixs:
            out_path = os.path.join(
                self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix)
            if os.path.exists(os.path.join(out_path,
                                           "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(out_path, "TSSstatistics.tsv"),
                    os.path.join(self.stat_outfolder,
                                 "TSSstatistics.tsv"))
        # Downstream naming uses "processing" instead of
        # "processing_site".
        if args_tss.program.lower() == "processing_site":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss)
        if args_tss.check_orphan:
            print("checking the orphan TSS...")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace(
                    "".join(["_", args_tss.program, ".gff"]), "")
                self.helper.check_make_folder(
                    os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            self._merge_manual(datas, args_tss)
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        if args_tss.stat:
            self._stat_tss(datas, args_tss.program)
        if args_tss.validate:
            self._validate(datas, args_tss)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss)
        self._remove_files(args_tss)
class TranscriptDetection(object):
    '''doing for transcript detection'''

    def __init__(self, args_tran):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_outfolder = os.path.join(args_tran.out_folder, "gffs")
        self.tran_path = os.path.join(self.gff_outfolder, "tmp")
        self.stat_path = os.path.join(args_tran.out_folder, "statistics")
        # Temporary file names/paths used by the individual steps.
        self.tmps = {"gff": "tmp.gff", "merge": "tmp_merge",
                     "tran": os.path.join(args_tran.out_folder, "tmp_tran"),
                     "tss_ta": os.path.join(self.gff_outfolder,
                                            "tmp_tss_ta"),
                     "ta_tss": os.path.join(self.gff_outfolder,
                                            "tmp_ta_tss"),
                     "ta_gff": os.path.join(self.gff_outfolder,
                                            "tmp_ta_gff"),
                     "gff_ta": os.path.join(self.gff_outfolder,
                                            "tmp_gff_ta"),
                     "uni": os.path.join(self.gff_outfolder, "tmp_uni"),
                     "overlap": os.path.join(self.gff_outfolder,
                                             "tmp_overlap")}
        # Suffixes of the per-strain result GFF files.
        self.frag = "transcript_fragment.gff"
        self.tex = "transcript_tex_notex.gff"
        self.endfix_tran = "transcript.gff"

    def _compute_transcript(self, wig_f, wig_r, wig_folder, wig_type,
                            strain, libs, args_tran):
        # Detect transcripts for one strain from its two strand wigs.
        print("Computing transcripts for {0}".format(strain))
        out = os.path.join(args_tran.out_folder,
                           "_".join([strain, wig_type]))
        detect_transcript(wig_f, wig_r, wig_folder, libs, out, wig_type,
                          args_tran)

    def _compute(self, wig_type, wigs, libs, args_tran):
        # Find every strain (via its *_forward.wig) and run detection.
        strains = []
        wig_folder = os.path.join(wigs, "tmp")
        for wig in os.listdir(wig_folder):
            if wig.endswith("_forward.wig"):
                strains.append(wig.replace("_forward.wig", ""))
        for strain in strains:
            f_file = os.path.join(wig_folder,
                                  "_".join([strain, "forward.wig"]))
            r_file = os.path.join(wig_folder,
                                  "_".join([strain, "reverse.wig"]))
            self._compute_transcript(f_file, r_file, wigs, wig_type,
                                     strain, libs, args_tran)
        return strains

    def _compare_tss(self, tas, args_tran, log):
        # Compare each strain's transcripts with its TSS file and write
        # a per-strain statistics CSV; both GFFs are re-sorted in place.
        self.multiparser.parser_gff(args_tran.compare_tss, "TSS")
        self.multiparser.combine_gff(
            self.gff_outfolder,
            os.path.join(args_tran.compare_tss, "tmp"),
            "transcript", "TSS")
        print("Comaring of transcripts and TSSs")
        log.write("Running stat_TA_comparison.py to compare transcripts "
                  "with TSSs.\n")
        tss_folder = os.path.join(args_tran.compare_tss, "tmp")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_tss_out = os.path.join(
                self.stat_path,
                "".join(["stat_compare_transcript_TSS_", ta, ".csv"]))
            for tss in os.listdir(tss_folder):
                filename = tss.split("_TSS")
                if (filename[0] == ta) and (tss.endswith(".gff")):
                    stat_ta_tss(ta_file, os.path.join(tss_folder, tss),
                                stat_tss_out, self.tmps["ta_tss"],
                                self.tmps["tss_ta"], args_tran.fuzzy)
                    os.remove(ta_file)
                    os.remove(os.path.join(tss_folder, tss))
                    self.helper.sort_gff(self.tmps["ta_tss"], ta_file)
                    self.helper.sort_gff(
                        self.tmps["tss_ta"],
                        os.path.join(args_tran.compare_tss, tss))
                    os.remove(self.tmps["tss_ta"])
                    os.remove(self.tmps["ta_tss"])
            log.write("\t" + stat_tss_out + "\n")

    def _compare_cds(self, tas, args_tran, log):
        # Compare each strain's transcripts with its genome annotation.
        self.multiparser.parser_gff(args_tran.gffs, None)
        self.multiparser.combine_gff(
            self.gff_outfolder, os.path.join(args_tran.gffs, "tmp"),
            "transcript", None)
        print("Comaring of transcripts and genome annotations")
        cds_folder = os.path.join(args_tran.gffs, "tmp")
        log.write("Running stat_TA_comparison.py to compare transcripts "
                  "with genome annotations.\n")
        for ta in tas:
            ta_file = os.path.join(self.gff_outfolder,
                                   "_".join([ta, self.endfix_tran]))
            stat_gff_out = os.path.join(
                self.stat_path,
                "".join(["stat_compare_transcript_genome_", ta, ".csv"]))
            for gff in os.listdir(cds_folder):
                if (gff[:-4] == ta) and (gff.endswith(".gff")):
                    cds_file = os.path.join(cds_folder, gff)
                    stat_ta_gff(ta_file, cds_file, stat_gff_out,
                                self.tmps["ta_gff"], self.tmps["gff_ta"],
                                args_tran.c_feature)
                    os.remove(ta_file)
                    os.remove(os.path.join(args_tran.gffs, gff))
                    self.helper.sort_gff(self.tmps["ta_gff"], ta_file)
                    self.helper.sort_gff(
                        self.tmps["gff_ta"],
                        os.path.join(args_tran.gffs, gff))
                    os.remove(self.tmps["ta_gff"])
                    os.remove(self.tmps["gff_ta"])
            log.write("\t" + stat_gff_out + ".\n")

    def _compare_tss_cds(self, tas, args_tran, log):
        '''compare transcript with CDS and TSS'''
        # Dispatch on which optional inputs are available.
        if (args_tran.compare_tss is not None) and (
                args_tran.c_feature is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran, log)
            self._compare_tss(tas, args_tran, log)
        elif (args_tran.c_feature is not None) and (
                args_tran.compare_tss is None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_cds(tas, args_tran, log)
        elif (args_tran.c_feature is None) and (
                args_tran.compare_tss is not None):
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self._compare_tss(tas, args_tran, log)

    def _for_one_wig(self, type_, args_tran):
        '''running transcript detection to one type of wig files'''
        if type_ == "tex_notex":
            libs = args_tran.tlibs
            wigs = args_tran.tex_wigs
        else:
            libs = args_tran.flibs
            wigs = args_tran.frag_wigs
        print("Importing {0} wig files".format(type_))
        strains = self._compute(type_, wigs, libs, args_tran)
        # Sort each raw result into the final per-strain GFF name.
        for strain in strains:
            out = os.path.join(
                self.gff_outfolder,
                "_".join([strain, "transcript", type_ + ".gff"]))
            print(os.path.join(args_tran.out_folder,
                               "_".join([strain, type_])))
            self.helper.sort_gff(
                os.path.join(args_tran.out_folder,
                             "_".join([strain, type_])), out)
            os.remove(os.path.join(args_tran.out_folder,
                                   "_".join([strain, type_])))
        return strains

    def _for_two_wigs(self, strains, args_tran, log):
        '''merge the results of fragemented and tex treated libs'''
        if (args_tran.frag_wigs is not None) and (
                args_tran.tex_wigs is not None):
            log.write("Running combine_frag_tex.py to merge the results "
                      "from fragmented libs and dRNA-Seq libs.\n")
            print("Merging fragmented and tex treated ones")
            for strain in strains:
                frag_gff = os.path.join(self.gff_outfolder,
                                        "_".join([strain, self.frag]))
                tex_gff = os.path.join(self.gff_outfolder,
                                       "_".join([strain, self.tex]))
                final_gff = os.path.join(
                    self.gff_outfolder,
                    "_".join([strain, self.endfix_tran]))
                for gff in os.listdir(self.gff_outfolder):
                    if "_transcript_" in gff:
                        filename = gff.split("_transcript_")
                        if (strain == filename[0]) and (
                                "tex_notex.gff" == filename[1]):
                            tex_file = gff
                        elif (strain == filename[0]) and (
                                "fragment.gff" == filename[1]):
                            frag_file = gff
                combine(os.path.join(self.gff_outfolder, frag_file),
                        os.path.join(self.gff_outfolder, tex_file),
                        args_tran.tolerance,
                        os.path.join(
                            self.gff_outfolder,
                            "_".join([strain, self.endfix_tran])))
                os.remove(frag_gff)
                os.remove(tex_gff)
                log.write("\t" + final_gff + " is generated.\n")
        else:
            # Only one library type: just rename its result.
            if args_tran.frag_wigs is not None:
                for strain in strains:
                    frag_gff = os.path.join(
                        self.gff_outfolder, "_".join([strain, self.frag]))
                    final_gff = os.path.join(
                        self.gff_outfolder,
                        "_".join([strain, self.endfix_tran]))
                    shutil.move(frag_gff, final_gff)
                    log.write("\t" + final_gff + " is generated.\n")
            elif args_tran.tex_wigs is not None:
                for strain in strains:
                    tex_gff = os.path.join(
                        self.gff_outfolder, "_".join([strain, self.tex]))
                    final_gff = os.path.join(
                        self.gff_outfolder,
                        "_".join([strain, self.endfix_tran]))
                    shutil.move(tex_gff, final_gff)
                    log.write("\t" + final_gff + " is generated.\n")

    def _post_modify(self, tas, args_tran):
        '''modify the transcript by comparing with genome annotation'''
        for ta in tas:
            for gff in os.listdir(args_tran.gffs):
                if (".gff" in gff) and (gff[:-4] == ta):
                    break
            print("Modifying {0} by refering to {1}".format(ta, gff))
            fill_gap(os.path.join(args_tran.gffs, gff),
                     os.path.join(self.tran_path,
                                  "_".join([ta, self.endfix_tran])),
                     "overlap", self.tmps["overlap"], args_tran.modify)
            fill_gap(os.path.join(args_tran.gffs, gff),
                     os.path.join(self.tran_path,
                                  "_".join([ta, self.endfix_tran])),
                     "uni", self.tmps["uni"], args_tran.modify)
            tmp_merge = os.path.join(self.gff_outfolder,
                                     self.tmps["merge"])
            # NOTE(review): this tests whether "tmp_merge" is a substring
            # of the folder *path*, not whether the file exists — looks
            # like it was meant to be `in os.listdir(...)`/path check.
            # TODO confirm before changing.
            if self.tmps["merge"] in self.gff_outfolder:
                os.remove(tmp_merge)
            self.helper.merge_file(self.tmps["overlap"], tmp_merge)
            self.helper.merge_file(self.tmps["uni"], tmp_merge)
            tmp_out = os.path.join(self.gff_outfolder,
                                   "_".join(["tmp", ta]))
            self.helper.sort_gff(tmp_merge, tmp_out)
            os.remove(self.tmps["overlap"])
            os.remove(self.tmps["uni"])
            os.remove(tmp_merge)
            final_out = os.path.join(self.gff_outfolder,
                                     "_".join(["final", ta]))
            longer_ta(tmp_out, args_tran.length, final_out)
            shutil.move(final_out,
                        os.path.join(self.tmps["tran"],
                                     "_".join([ta, self.endfix_tran])))
            os.remove(tmp_out)
        # Replace the output folder by the modified transcripts.
        shutil.rmtree(self.gff_outfolder)
        shutil.move(self.tmps["tran"], self.gff_outfolder)

    def _remove_file(self, args_tran):
        if "tmp_wig" in os.listdir(args_tran.out_folder):
            shutil.rmtree(os.path.join(args_tran.out_folder, "tmp_wig"))
        if "merge_wigs" in os.listdir(args_tran.out_folder):
            shutil.rmtree(os.path.join(args_tran.out_folder,
                                       "merge_wigs"))
        self.helper.remove_tmp_dir(args_tran.gffs)
        self.helper.remove_tmp_dir(args_tran.compare_tss)
        self.helper.remove_tmp_dir(args_tran.terms)
        self.helper.remove_tmp(os.path.join(args_tran.out_folder, "gffs"))
        self.helper.remove_tmp(self.gff_outfolder)

    def _compare_term_tran(self, args_tran, log):
        '''searching the associated terminator to transcript'''
        if args_tran.terms is not None:
            print("Comparing between terminators and transcripts")
            self.multiparser.parser_gff(args_tran.terms, "term")
            if args_tran.gffs is not None:
                self.multiparser.combine_gff(
                    args_tran.gffs, os.path.join(args_tran.terms, "tmp"),
                    None, "term")
            log.write("Running compare_tran_term.py to compare "
                      "transcripts with terminators.\n")
            compare_term_tran(self.gff_outfolder,
                              os.path.join(args_tran.terms, "tmp"),
                              args_tran.fuzzy_term, args_tran.fuzzy_term,
                              args_tran.out_folder, "transcript",
                              args_tran.terms, self.gff_outfolder)
            for file_ in os.listdir(os.path.join(args_tran.out_folder,
                                                 "statistics")):
                if file_.startswith(
                        "stat_compare_transcript_terminator_"):
                    log.write("\t" + file_ + " is generated.\n")

    def _re_table(self, args_tran, log):
        # Rewrite every transcript table with coverage details.
        log.write("Running re_table.py to generate coverage "
                  "information.\n")
        log.write("The following files are updated:\n")
        for gff in os.listdir(self.gff_outfolder):
            if os.path.isfile(os.path.join(self.gff_outfolder, gff)):
                tran_table = os.path.join(args_tran.out_folder, "tables",
                                          gff.replace(".gff", ".csv"))
                reorganize_table(args_tran.libs, args_tran.merge_wigs,
                                 "Coverage_details", tran_table)
                log.write("\t" + tran_table + "\n")

    def _list_files(self, folder, log, end):
        # Log the files of *folder*; filter by suffix when *end* is given.
        log.write("The following files in {0} are generated:\n".format(
            folder))
        for file_ in os.listdir(folder):
            if (end is not None) and (file_.endswith(end)):
                log.write("\t" + file_ + "\n")
            elif end is None:
                log.write("\t" + file_ + "\n")

    def run_transcript(self, args_tran, log):
        """Top-level driver: detect transcripts from the wig libraries,
        merge/modify them and run the comparison and reporting steps."""
        if (args_tran.frag_wigs is None) and (args_tran.tex_wigs is None):
            log.write("No wig file is assigned.\n")
            print("Error: There is no wiggle file!\n")
            sys.exit()
        if args_tran.frag_wigs is not None:
            log.write("Running transcript_detection.py for detecting "
                      "transcripts based on fragmented libs.\n")
            strains = self._for_one_wig("fragment", args_tran)
        if args_tran.tex_wigs is not None:
            log.write("Running transcript_detection.py for detecting "
                      "transcripts based on dRNA-Seq libs.\n")
            strains = self._for_one_wig("tex_notex", args_tran)
        self._for_two_wigs(strains, args_tran, log)
        tas = []
        if "none" not in args_tran.modify:
            for gff in os.listdir(args_tran.gffs):
                if gff.endswith(".gff"):
                    self.helper.sort_gff(
                        os.path.join(args_tran.gffs, gff),
                        self.tmps["gff"])
                    shutil.move(self.tmps["gff"],
                                os.path.join(args_tran.gffs, gff))
            self.multiparser.combine_gff(
                args_tran.gffs, os.path.join(args_tran.gffs, "tmp"),
                None, None)
            self.multiparser.parser_gff(self.gff_outfolder, "transcript")
            self.multiparser.combine_gff(args_tran.gffs, self.tran_path,
                                         None, "transcript")
            self.helper.check_make_folder(self.tmps["tran"])
            # Only strains with a non-empty transcript file are modified.
            for ta in os.listdir(self.tran_path):
                if ta.endswith(".gff"):
                    if os.path.getsize(os.path.join(self.tran_path,
                                                    ta)) != 0:
                        tas.append(ta.replace("_" + self.endfix_tran,
                                              ""))
            log.write("Running fill_gap.py to modify transcripts "
                      "based on genome annotations.\n")
            self._post_modify(tas, args_tran)
        self._compare_tss_cds(tas, args_tran, log)
        self._compare_term_tran(args_tran, log)
        print("Generating tables for the details")
        log.write("Running gen_table_tran.py to generate the table of transcripts.\n")
        gen_table_transcript(self.gff_outfolder, args_tran)
        self._list_files(os.path.join(args_tran.out_folder, "tables"),
                         log, None)
        log.write("Running plot_tran to plot the distribution of the length of "
                  "the transcripts.\n")
        plot_tran(self.gff_outfolder, self.stat_path, args_tran.max_dist)
        self._list_files(self.stat_path, log, ".png")
        self._re_table(args_tran, log)
        self._remove_file(args_tran)
class Terminator(object):
    '''detection of terminator'''

    def __init__(self, args_term):
        # Shared helper objects (project-local utilities).
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        # "tmp" subfolders are produced by the multiparser per-genome split.
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        # Output roots: gff results under "gffs", tables under "tables".
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        # Candidate categories: all / expressed / best / non-expressed.
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "expressed_candidates"),
                      "best": os.path.join(self.outfolder["term"],
                                           "best_candidates"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_expressed_candidates")}
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"],
                                             "expressed_candidates"),
                     "best": os.path.join(self.outfolder["csv"],
                                          "best_candidates"),
                     "non": os.path.join(self.outfolder["csv"],
                                         "non_expressed_candidates")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        # Scratch locations; several are rooted in the current working dir.
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(), "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        """Concatenate the data sections of several ptt/rnt files.

        Lines are copied only after a header line containing "Location" has
        been seen in each input; a trailing newline is appended when the
        last line lacks one.
        """
        with open(combine_file, 'w') as result:
            for file_ in files:
                check_start = False
                fh = open(file_, 'r')
                for line in fh:
                    if check_start:
                        result.write(line)
                    if "Location" in line:
                        check_start = True
                if "\n" not in line:
                    result.write("\n")
                fh.close()

    def _make_gff_folder(self):
        # Create the gff and table folders for each candidate category.
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        """Convert every gff under *gff_path* to rnt/ptt files.

        Returns (file_types, prefixs): file_types maps each genome prefix
        to "srna" or "normal" depending on whether sRNA annotations were
        available; prefixs lists all processed genome prefixes.
        """
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path, gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path, gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(fasta_path, ".fa",
                                                     prefix, None, None)
                if not fasta:
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                if sRNAs:
                    # NOTE(review): parser_gff is re-run for every genome in
                    # the loop — possibly redundant work.
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                        self.srna_path, "_sRNA.gff", prefix, None, None)
                    if (srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                    if (not srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, None, None)
                        file_types[prefix] = "normal"
                else:
                    self.converter.convert_gff2rntptt(gff_file, fasta,
                                                      ptt_file, rnt_file,
                                                      None, None)
                    file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        """Merge each genome's ptt/rnt (and sRNA rnt) into one combine file
        used as TransTermHP's annotation input."""
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        """Invoke TransTermHP on one genome; stdout is captured into *out*.

        Produces the .t2t (tail-to-tail region) and .bag (best terminator
        after gene) output files in *out_path*.
        """
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join(
                  [prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)

    def _run_TransTermHP(self, args_term):
        """Run TransTermHP for every combined annotation file, one output
        folder per genome; removes the combine folder afterwards."""
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                                     prefix, None, None)
                if not fasta:
                    print("Error: {0}.fa can not be found!".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(os.path.join(out_path,
                           "_".join([prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path, prefix, out,
                                  args_term)
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        """Convert TransTermHP .bag outputs to gff and combine per genome."""
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                self.tmps["transterm"],
                                "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs, self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        """Return the folder holding all wig files; when both TEX and
        fragmented wigs exist they are copied into a new "merge_wigs"
        folder next to the TEX folder."""
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: Wiggle files are not assigned!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        '''searching the terminator with sRNA information'''
        # When sRNA gffs are given, merge them with the genome gff per
        # prefix into tmp_merge_gff; otherwise use gff_path directly.
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"], self.tmps["gff"])
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(os.path.join(gff_path, prefix + ".gff"),
                                       tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])), tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        """Sort the detected *_term.gff files, collect them into the
        "all_candidates" gff, and build the matching raw csv tables."""
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                    prefix, self.suffixs["allgff"]]))
                # NOTE(review): redundant nested os.path.join here.
                csv_file = os.path.join(
                    os.path.join(self.csvs["all"], "_".join([
                        prefix, self.suffixs["csv"]])))
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                    os.path.join(term_outfolder, gff),
                    os.path.join(self.terms["all"], "_".join([
                        prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                if ("_".join([prefix, self.suffixs["csv"]]) in
                        os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join(["Genome", "Name", "Start", "End",
                                         "Strand", "Detect",
                                         "Coverage_decrease",
                                         "Coverage_detail"]) + "\n")
                out_csv.close()
                # Append each strain's raw table once per seq_id change.
                fh = open(new_gff)
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            os.path.join(self.csvs["all"], "_".join([
                                prefix, self.suffixs["csv"]])))
                    pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        # NOTE(review): "secondray" typo in the user-facing message; also
        # os.system with unquoted paths breaks on spaces — consider
        # subprocess with a list.
        print("Computing secondray structures of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        os.system(" ".join([RNAfold_path, "<", os.path.join("..", tmp_seq),
                            ">", os.path.join("..", tmp_sec)]))
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(self, prefixs, merge_path,
                                              wig_path, merge_wigs,
                                              args_term):
        '''the approach for searching gene converged region terminator'''
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_index = os.path.join(args_term.out_folder,
                                     "_".join(["inter_index", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            # NOTE(review): doubled assignment "tmp_cand = tmp_cand =" —
            # harmless but should be a single assignment.
            tmp_cand = tmp_cand = os.path.join(
                args_term.out_folder, "_".join(["term_candidates", prefix]))
            if os.path.exists(tran_file):
                print("Extracting sequences of {0}".format(prefix))
                # Extract intergenic sequences, fold them with RNAfold, then
                # scan the structures for poly-T tails and check coverage.
                intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"),
                               tran_file, gff_file, tmp_seq, tmp_index,
                               args_term)
                self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                                  prefix)
                extract_info_sec(tmp_sec, tmp_seq, tmp_index)
                os.remove(tmp_index)
                poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand,
                       args_term)
                print("Detecting terminators for " + prefix)
                detect_coverage(
                    tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                    os.path.join(self.tran_path, "_".join([
                        prefix, "transcript.gff"])),
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                    os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                    os.path.join(self.tmps["hp_path"], "_".join([
                        prefix, self.tmps["hp_gff"]])), merge_wigs,
                    os.path.join(self.outfolder["term"], "_".join([
                        prefix, self.suffixs["gff"]])),
                    os.path.join(self.tmps["term_table"], "_".join([
                        prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs, self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        """Delete all temporary folders and intermediate files."""
        self.helper.remove_tmp_dir(args_term.gffs)
        self.helper.remove_tmp_dir(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp_dir(args_term.trans)
        if "tmp_wig" in os.listdir(args_term.out_folder):
            shutil.rmtree(os.path.join(args_term.out_folder, "tmp_wig"))
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(self.outfolder["term"],
                                       "_term.gff", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term):
        """Assign IDs/Names to all candidates, then compute statistics and
        split candidates into best/express/non categories."""
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    # Zero-padded 5-digit serial used in the Name attribute.
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = (entry.seq_id +
                                              "_terminator" + str(num))
                    # NOTE(review): "_".join of a single-element list is a
                    # no-op wrapper around the string.
                    entry.attributes["Name"] = "_".join(["terminator_" + name])
                    entry.attribute_string = ";".join([
                        "=".join(items) for items in
                        entry.attributes.items()])
                    out_tmp.write("\t".join([entry.info_without_attributes,
                                             entry.attribute_string]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                shutil.move(self.tmps["gff"], os.path.join(
                    self.terms["all"],
                    "_".join([new_prefix, self.suffixs["gff"]])))
        stat_path = os.path.join(args_term.out_folder, "statistics")
        for prefix in new_prefixs:
            stat_term(os.path.join(self.terms["all"],
                                   "_".join([prefix, self.suffixs["gff"]])),
                      os.path.join(self.csvs["all"],
                                   "_".join([prefix, self.suffixs["csv"]])),
                      os.path.join(stat_path,
                                   "_".join(["stat", prefix + ".csv"])),
                      os.path.join(self.terms["best"],
                                   "_".join([prefix, "term"])),
                      os.path.join(self.terms["express"],
                                   "_".join([prefix, "term"])),
                      os.path.join(self.terms["non"],
                                   "_".join([prefix, "term"])))
            # stat_term writes the csvs next to the gffs; move them into
            # the table folders.
            shutil.move(os.path.join(self.terms["best"],
                                     "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["best"],
                                     "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["express"],
                                     "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["express"],
                                     "_".join([prefix, self.suffixs["csv"]])))
            shutil.move(os.path.join(self.terms["non"],
                                     "_".join([prefix, self.suffixs["csv"]])),
                        os.path.join(self.csvs["non"],
                                     "_".join([prefix, self.suffixs["csv"]])))
            os.remove(os.path.join(self.terms["all"],
                                   "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        # Validate that every gff in the folder has unique attributes.
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(folder, file_))

    def _compare_term_tran(self, args_term, prefixs):
        '''searching the associated terminator to transcript'''
        self.multiparser.combine_gff(args_term.gffs, self.tran_path, None,
                                     "transcript")
        # NOTE(review): the "prefixs" parameter is immediately discarded and
        # rebuilt from the transcript folder — confirm the parameter is
        # actually needed.
        prefixs = []
        print("Comparing terminators with transcripts now")
        for file_ in os.listdir(self.tran_path):
            if file_.endswith("_transcript.gff"):
                prefixs.append(file_.replace("_transcript.gff", ""))
        for type_ in ("best_candidates", "expressed_candidates",
                      "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta, args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator",
                              self.outfolder["term"], args_term.trans)
            for prefix in prefixs:
                # Rename the generic stat file to a per-category name.
                shutil.move(
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "stat_compare_transcript_terminator_" + prefix +
                        ".csv"),
                    os.path.join(
                        args_term.out_folder, "statistics",
                        "_".join(["stat_compare_terminator_transcript",
                                  prefix, type_ + ".csv"])))

    def run_terminator(self, args_term):
        """Entry point: run the whole terminator-detection pipeline."""
        # NOTE(review): the gff/fasta presence check happens only after
        # _check_gff_file and parser_fasta have already used the folders —
        # consider validating first.
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: Please assign gff files "
                  "and fasta files!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"], self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs,
                                      self.gff_path)
        self._compute_intersection_forward_reverse(
            prefixs, merge_path, args_term.wig_path, args_term.merge_wigs,
            args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term, prefixs)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
def __init__(self): self.multiparser = Multiparser() self.helper = Helper()
class ArgsContainer(object):
    """Builds per-subcommand argument namespaces.

    Each ``container_*`` method stores the raw CLI values as attributes on
    this object (or on a passed-in namespace) and returns it; the private
    helpers validate/normalize libraries, replicates and wig folders.
    """

    def __init__(self):
        self.multiparser = Multiparser()
        self.helper = Helper()

    def _check_replicates(self, replicates_tex, replicates_frag):
        """Normalize replicate counts to {"tex": n, "frag": n}.

        -1 marks an absent library type; exits when neither is given.
        """
        if (replicates_tex is not None) and (replicates_frag is not None):
            replicates = {"tex": int(replicates_tex),
                          "frag": int(replicates_frag)}
        elif replicates_tex is not None:
            replicates = {"tex": int(replicates_tex), "frag": -1}
        elif replicates_frag is not None:
            replicates = {"tex": -1, "frag": int(replicates_frag)}
        else:
            print("Error:No replicates number assign!!!")
            sys.exit()
        return replicates

    def _check_libs(self, tex_notex_libs, frag_libs):
        """Combine TEX+/- and fragmented library lists into one list.

        NOTE(review): when both inputs are None only an error is printed —
        there is no sys.exit(), so "return libs" raises UnboundLocalError.
        Confirm and add an exit.
        """
        if (tex_notex_libs is None) and (frag_libs is None):
            print("Error: please input proper libraries!!")
        if (tex_notex_libs is not None) and (frag_libs is not None):
            libs = tex_notex_libs + frag_libs
        elif (tex_notex_libs is not None):
            libs = tex_notex_libs
        elif (frag_libs is not None):
            libs = frag_libs
        return libs

    def _parser_combine_wigs(self, subcommand):
        """Split/parse wig folders and combine per-genome wig files.

        Sets tex_path/frag_path/merge_wigs/wig_path; exits when no wig
        folder was assigned at all.
        """
        self.tex_path = None
        self.frag_path = None
        self.multiparser.parser_gff(self.gffs, None)
        if subcommand == "terminator":
            gff_path = os.path.join(self.gffs, "tmp")
            self.multiparser.parser_gff(gff_path, None)
        else:
            gff_path = self.gffs
        if self.tex_wigs is not None:
            self.tex_path = os.path.join(self.tex_wigs, "tmp")
            self.multiparser.parser_wig(self.tex_wigs)
            self.multiparser.combine_wig(gff_path, self.tex_path, None,
                                         self.libs)
            self.merge_wigs = self.tex_wigs
            self.wig_path = self.tex_path
        if self.frag_wigs is not None:
            self.frag_path = os.path.join(self.frag_wigs, "tmp")
            self.multiparser.parser_wig(self.frag_wigs)
            self.multiparser.combine_wig(gff_path, self.frag_path, None,
                                         self.libs)
            self.merge_wigs = self.frag_wigs
            self.wig_path = self.frag_path
        # Both library types present: physically merge them.
        if (self.tex_path is not None) and (
                self.frag_path is not None):
            self = self._merge_wig()
        if (self.tex_path is None) and (
                self.frag_path is None):
            print("Error: There is no proper wig files assigned!!")
            sys.exit()
        return self

    def _merge_wig(self):
        """Copy TEX and fragmented wig files into a shared merge folder."""
        self.merge_wigs = os.path.join(self.out_folder, "merge_wigs")
        if (self.tex_wigs is not None) and (
                self.frag_wigs is not None):
            self.helper.check_make_folder(self.merge_wigs)
            self.wig_path = os.path.join(self.merge_wigs, "tmp")
            self.helper.check_make_folder(self.wig_path)
            for wig in os.listdir(self.tex_wigs):
                if os.path.isfile(os.path.join(self.tex_wigs, wig)):
                    shutil.copy(os.path.join(self.tex_wigs, wig),
                                self.merge_wigs)
            for wig in os.listdir(self.frag_wigs):
                if os.path.isfile(os.path.join(self.frag_wigs, wig)):
                    shutil.copy(os.path.join(self.frag_wigs, wig),
                                self.merge_wigs)
            for wig in os.listdir(self.tex_path):
                if os.path.isfile(os.path.join(self.tex_path, wig)):
                    shutil.copy(os.path.join(self.tex_path, wig),
                                self.wig_path)
            # Fragmented tracks are appended to the copied TEX tracks.
            for wig in os.listdir(self.frag_path):
                if os.path.isfile(os.path.join(self.frag_path, wig)):
                    self.helper.merge_file(os.path.join(self.frag_path, wig),
                                           os.path.join(self.wig_path, wig))
        elif (self.tex_wigs is not None):
            self.merge_wigs = self.tex_wigs
        elif (self.frag_wigs is not None):
            self.merge_wigs = self.frag_wigs
        return self

    def _deal_multi_inputs(self, inputs, file_type, num, command):
        """Split a comma-separated CLI value into a typed list.

        *file_type* selects float/int conversion; *num*, when given, is the
        expected element count. NOTE(review): a wrong count only prints an
        error and continues — confirm whether it should exit.
        """
        if inputs is not None:
            datas = inputs.split(",")
            if num is not None:
                if (len(datas) != num):
                    print("Error: the amount of {0} is not correct!!".format(
                        command))
            new_inputs = []
            for data in datas:
                if file_type == "float":
                    new_inputs.append(float(data.strip()))
                elif file_type == "int":
                    new_inputs.append(int(data.strip()))
                else:
                    new_inputs.append(data)
            return new_inputs
        else:
            return inputs

    def container_ratt(self, ratt_path, element, transfer_type,
                       ref_embl_gbk, target_fasta, ref_fasta, ratt_folder,
                       convert_to_gff_rnt_ptt, tar_annotation_folder,
                       compare_pair):
        """Namespace for the RATT annotation-transfer subcommand."""
        self.ratt_path = ratt_path
        self.element = element
        self.transfer_type = transfer_type
        self.ref_embls = ref_embl_gbk
        self.tar_fastas = target_fasta
        self.ref_fastas = ref_fasta
        self.output_path = ratt_folder
        self.convert = convert_to_gff_rnt_ptt
        self.gff_outfolder = tar_annotation_folder
        self.pairs = self._deal_multi_inputs(compare_pair, "str", None, None)
        return self

    def container_tsspredator(self, TSSpredator_path, compute_program,
                              fasta_folder, annotation_folder, wig_folder,
                              lib, output_prefix, height, height_reduction,
                              factor, factor_reduction, base_height,
                              enrichment_factor, processing_factor,
                              replicate_match, out_folder, statistics,
                              validate_gene, merge_manual,
                              compare_transcript_assembly, fuzzy, utr_length,
                              cluster, length, re_check_orphan,
                              overlap_feature, reference_gff_folder,
                              remove_low_expression):
        """Namespace for the TSSpredator (TSS/processing-site) subcommand."""
        self.tsspredator_path = TSSpredator_path
        self.program = compute_program
        self.fastas = fasta_folder
        self.gffs = annotation_folder
        self.wig_folder = wig_folder
        self.libs = self._deal_multi_inputs(lib, "str", None, None)
        self.output_prefixs = self._deal_multi_inputs(output_prefix, "str",
                                                      None, None)
        self.height = height
        self.height_reduction = height_reduction
        self.factor = factor
        self.factor_reduction = factor_reduction
        self.base_height = base_height
        self.enrichment_factor = enrichment_factor
        self.processing_factor = processing_factor
        self.repmatch = replicate_match
        self.out_folder = out_folder
        self.stat = statistics
        self.validate = validate_gene
        self.manual = merge_manual
        self.ta_files = compare_transcript_assembly
        self.fuzzy = fuzzy
        self.utr_length = utr_length
        self.cluster = cluster
        self.nt_length = length
        self.check_orphan = re_check_orphan
        self.overlap_feature = overlap_feature
        self.references = reference_gff_folder
        self.remove_low_expression = remove_low_expression
        return self

    def container_optimize(self, TSSpredator_path, fasta_file,
                           annotation_file, wig_folder, manual, out_folder,
                           strain_name, max_height, max_height_reduction,
                           max_factor, max_factor_reduction, max_base_height,
                           max_enrichment_factor, max_processing_factor,
                           utr_length, lib, output_prefix, cluster, length,
                           core, program, replicate_match, steps):
        """Namespace for the TSSpredator parameter-optimization subcommand."""
        self.tsspredator_path = TSSpredator_path
        self.fastas = fasta_file
        self.gffs = annotation_file
        self.wigs = wig_folder
        self.manual = manual
        self.output_folder = out_folder
        self.project_strain = strain_name
        self.height = max_height
        self.height_reduction = max_height_reduction
        self.factor = max_factor
        self.factor_reduction = max_factor_reduction
        self.base_height = max_base_height
        self.enrichment = max_enrichment_factor
        self.processing = max_processing_factor
        self.utr = utr_length
        self.libs = self._deal_multi_inputs(lib, "str", None, None)
        self.replicate_name = self._deal_multi_inputs(output_prefix, "str",
                                                      None, None)
        self.cluster = cluster
        self.length = length
        self.cores = core
        self.program = program
        self.replicate = replicate_match
        self.steps = steps
        return self

    def container_terminator(
            self, TransTermHP_path, expterm_path, RNAfold_path, out_folder,
            fasta_folder, annotation_folder, transcript_folder, srna,
            statistics, tex_wig_folder, frag_wig_folder, decrease,
            highest_coverage, fuzzy_detect_coverage, fuzzy_within_transcript,
            fuzzy_downstream_transcript, fuzzy_within_gene,
            fuzzy_downstream_gene, transtermhp_folder, tex_notex_libs,
            frag_libs, tex_notex, replicates_tex, replicates_frag,
            table_best, min_loop_length, max_loop_length, min_stem_length,
            max_stem_length, min_AT_tail_length, miss_rate, range_u):
        """Namespace for the terminator-detection subcommand."""
        self.TransTermHP_path = TransTermHP_path
        self.expterm_path = expterm_path
        self.RNAfold_path = RNAfold_path
        self.out_folder = out_folder
        self.fastas = fasta_folder
        self.gffs = annotation_folder
        self.trans = transcript_folder
        self.srnas = srna
        self.stat = statistics
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.decrease = decrease
        self.cutoff_coverage = highest_coverage
        self.fuzzy = fuzzy_detect_coverage
        self.fuzzy_up_ta = fuzzy_within_transcript
        self.fuzzy_down_ta = fuzzy_downstream_transcript
        self.fuzzy_up_gene = fuzzy_within_gene
        self.fuzzy_down_gene = fuzzy_downstream_gene
        self.hp_folder = transtermhp_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None,
                                             None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.table_best = table_best
        self.min_loop = min_loop_length
        self.max_loop = max_loop_length
        self.min_stem = min_stem_length
        self.max_stem = max_stem_length
        self.at_tail = min_AT_tail_length
        self.miss_rate = miss_rate
        self.range_u = range_u
        self = self._parser_combine_wigs("terminator")
        return self

    def container_transcript(
            self, frag_wig_path, tex_wig_path, tex_notex, length,
            annotation_folder, height, width, tolerance, tolerance_coverage,
            replicates_tex, replicates_frag,
            transcript_assembly_output_folder, compare_TSS,
            compare_genome_annotation, TSS_fuzzy, tex_treated_libs,
            fragmented_libs, compare_feature_genome, table_best,
            terminator_folder, fuzzy_term):
        """Namespace for the transcript-detection subcommand."""
        self.frag_wigs = frag_wig_path
        self.tex_wigs = tex_wig_path
        self.tex = tex_notex
        self.length = length
        self.gffs = annotation_folder
        self.height = height
        self.width = width
        self.tolerance = tolerance
        self.low_cutoff = tolerance_coverage
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.out_folder = transcript_assembly_output_folder
        self.compare_tss = compare_TSS
        self.compare_cds = compare_genome_annotation
        self.fuzzy = TSS_fuzzy
        self.tlibs = self._deal_multi_inputs(tex_treated_libs, "str", None,
                                             None)
        self.flibs = self._deal_multi_inputs(fragmented_libs, "str", None,
                                             None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.c_feature = self._deal_multi_inputs(compare_feature_genome,
                                                 "str", None, None)
        self.table_best = table_best
        self.terms = terminator_folder
        self.fuzzy_term = fuzzy_term
        self = self._parser_combine_wigs("transcript")
        return self

    def container_utr(self, tss_folder, annotation_folder,
                      transcript_assembly_folder, terminator_folder,
                      terminator_fuzzy, utr_folder, tss_source,
                      base_5utr, length, base_3utr):
        """Namespace for the UTR-detection subcommand."""
        self.tsss = tss_folder
        self.gffs = annotation_folder
        self.trans = transcript_assembly_folder
        self.terms = terminator_folder
        self.fuzzy = terminator_fuzzy
        self.out_folder = utr_folder
        self.source = tss_source
        self.base_5utr = base_5utr
        self.base_3utr = base_3utr
        self.length = length
        return self

    def container_srna(
            self, Vienna_folder, Vienna_utils, blast_plus_folder,
            ps2pdf14_path, srna_folder, UTR_derived_sRNA, annotation_folder,
            TSS_folder, transcript_assembly_folder, TSS_intergenic_fuzzy,
            TSS_5UTR_fuzzy, TSS_3UTR_fuzzy, TSS_interCDS_fuzzy, import_info,
            tex_wig_folder, frag_wig_folder, processing_site_folder,
            fasta_folder, mountain_plot, nr_format, srna_format,
            sRNA_database_path, nr_database_path, cutoff_energy,
            run_intergenic_TEX_coverage, run_intergenic_noTEX_coverage,
            run_intergenic_fragmented_coverage, run_antisense_TEX_coverage,
            run_antisense_noTEX_coverage, run_antisense_fragmented_coverage,
            intergenic_tolerance, run_utr_TEX_coverage,
            run_utr_noTEX_coverage, run_utr_fragmented_coverage, max_length,
            min_length, tex_notex_libs, frag_libs, replicates_tex,
            replicates_frag, tex_notex, blast_e_nr, blast_e_srna,
            detect_sRNA_in_CDS, table_best, decrease_intergenic,
            decrease_utr, fuzzy_intergenic, fuzzy_utr, cutoff_nr_hit, sORF,
            best_with_all_sRNAhit, best_without_sORF_candidate,
            overlap_percent_CDS, terminator_folder, terminator_fuzzy_in_CDS,
            terminator_fuzzy_out_CDS, best_with_terminator,
            ignore_hypothetical_protein, TSS_source, min_utr_coverage,
            promoter_table, best_with_promoter, ranking_promoter,
            promoter_name):
        """Namespace for the sRNA-detection subcommand."""
        self.vienna_path = Vienna_folder
        self.vienna_util = Vienna_utils
        self.blast_path = blast_plus_folder
        self.ps2pdf14_path = ps2pdf14_path
        self.out_folder = srna_folder
        self.utr_srna = UTR_derived_sRNA
        self.gffs = annotation_folder
        self.tss_folder = TSS_folder
        self.trans = transcript_assembly_folder
        self.fuzzy_inter_tss = TSS_intergenic_fuzzy
        self.fuzzy_5utr_tss = TSS_5UTR_fuzzy
        self.fuzzy_3utr_tss = TSS_3UTR_fuzzy
        self.fuzzy_intercds_tss = TSS_interCDS_fuzzy
        # Per-region TSS fuzziness lookup.
        self.fuzzy_tsss = {"5utr": self.fuzzy_5utr_tss,
                           "3utr": self.fuzzy_3utr_tss,
                           "interCDS": self.fuzzy_intercds_tss,
                           "inter": self.fuzzy_inter_tss}
        self.import_info = self._deal_multi_inputs(import_info, "str", None,
                                                   None)
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.pro_folder = processing_site_folder
        self.fastas = fasta_folder
        self.mountain = mountain_plot
        self.nr_format = nr_format
        self.srna_format = srna_format
        self.srna_database = sRNA_database_path
        self.nr_database = nr_database_path
        self.energy = cutoff_energy
        # Coverage cutoffs: 5 values per intergenic/antisense option.
        self.coverage_tex = self._deal_multi_inputs(
            run_intergenic_TEX_coverage, "float", 5,
            "--run_intergenic_TEX_coverage")
        self.coverage_notex = self._deal_multi_inputs(
            run_intergenic_noTEX_coverage, "float", 5,
            "--run_intergenic_noTEX_coverage")
        self.coverage_frag = self._deal_multi_inputs(
            run_intergenic_fragmented_coverage, "float", 5,
            "--run_intergenic_fragmented_coverage")
        self.anti_cover_tex = self._deal_multi_inputs(
            run_antisense_TEX_coverage, "float", 5,
            "--run_antisense_TEX_coverage")
        self.anti_cover_notex = self._deal_multi_inputs(
            run_antisense_noTEX_coverage, "float", 5,
            "--run_antisense_noTEX_coverage")
        self.anti_cover_frag = self._deal_multi_inputs(
            run_antisense_fragmented_coverage, "float", 5,
            "--run_antisense_fragmented_coverage")
        self.tolerance = intergenic_tolerance
        self.utr_tex_cover = self._deal_multi_inputs(
            run_utr_TEX_coverage, "str", 3, "--run_utr_TEX_coverage")
        # NOTE(review): error label below says TEX but the option is the
        # noTEX one — the message shown on a wrong count is misleading.
        self.utr_notex_cover = self._deal_multi_inputs(
            run_utr_noTEX_coverage, "str", 3, "--run_utr_TEX_coverage")
        self.utr_frag_cover = self._deal_multi_inputs(
            run_utr_fragmented_coverage, "str", 3,
            "--run_utr_fragmented_coverage")
        self.max_len = max_length
        self.min_len = min_length
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None,
                                             None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.tex_notex = tex_notex
        self.e_nr = blast_e_nr
        self.e_srna = blast_e_srna
        self.in_cds = detect_sRNA_in_CDS
        self.table_best = table_best
        self.decrease_inter = decrease_intergenic
        self.decrease_utr = decrease_utr
        self.fuzzy_inter = fuzzy_intergenic
        self.fuzzy_utr = fuzzy_utr
        self.nr_hits_num = cutoff_nr_hit
        self.sorf_file = sORF
        self.all_hit = best_with_all_sRNAhit
        self.best_sorf = best_without_sORF_candidate
        self.cutoff_overlap = overlap_percent_CDS
        self.terms = terminator_folder
        self.fuzzy_b = terminator_fuzzy_in_CDS
        self.fuzzy_a = terminator_fuzzy_out_CDS
        self.best_term = best_with_terminator
        self.hypo = ignore_hypothetical_protein
        self.tss_source = TSS_source
        self.min_utr = min_utr_coverage
        self.promoter_table = promoter_table
        self.best_promoter = best_with_promoter
        # NOTE(review): the test rejects values < 1, i.e. requires >= 1,
        # but the message claims it "must larger than 1" — inconsistent.
        if ranking_promoter < 1:
            print("Error: --ranking_time_promoter must larger than 1...")
            sys.exit()
        self.rank_promoter = ranking_promoter
        self.promoter_name = self._deal_multi_inputs(promoter_name, "str",
                                                     None, None)
        self = self._parser_combine_wigs("srna")
        return self

    def container_intersrna(self, file_type, files, args_srna, prefix,
                            gff_file, tran_file, tss_file, pro_file, fuzzy):
        '''Set the arguments for intergenic and antisense sRNA detection;
        "frag" selects the fragmented-library settings, anything else the
        TEX +/- settings.'''
        args_srna.file_type = file_type
        args_srna.gff_file = gff_file
        args_srna.tran_file = tran_file
        args_srna.tss_file = tss_file
        args_srna.pro_file = pro_file
        args_srna.fuzzy = fuzzy
        args_srna.prefix = prefix
        if file_type == "frag":
            args_srna.wig_f_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.cutoffs = args_srna.coverage_frag
            args_srna.tss_source = True
            args_srna.cut_notex = None
            args_srna.anti_notex_cutoff = None
        else:
            args_srna.wig_f_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.cutoffs = args_srna.coverage_tex
            # NOTE(review): self-assignment, keeps the existing value.
            args_srna.tss_source = args_srna.tss_source
            args_srna.cut_notex = args_srna.coverage_notex
            args_srna.anti_notex_cutoff = args_srna.anti_cover_notex
        return args_srna

    def container_utrsrna(self, gff, tran, tss, files, pro, fasta,
                          file_type, prefix, args_srna):
        '''Set the arguments for UTR-derived sRNA detection.'''
        args_srna.file_type = file_type
        args_srna.gff_file = gff
        args_srna.ta_file = tran
        args_srna.tss_file = tss
        args_srna.pro_file = pro
        args_srna.prefix = prefix
        args_srna.seq_file = fasta
        if file_type == "frag":
            args_srna.wig_f_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.frag_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.frag_wigs
            args_srna.input_libs = args_srna.flibs
            args_srna.output_file = files["frag_gff"]
            args_srna.output_table = files["frag_csv"]
            args_srna.utr_coverages = args_srna.utr_frag_cover
            args_srna.notex = None
        else:
            args_srna.wig_f_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "forward.wig"]))
            args_srna.wig_r_file = os.path.join(
                args_srna.tex_path, "_".join([prefix, "reverse.wig"]))
            args_srna.wig_folder = args_srna.tex_wigs
            args_srna.input_libs = args_srna.tlibs
            args_srna.output_file = files["tex_gff"]
            args_srna.output_table = files["tex_csv"]
            args_srna.utr_coverages = args_srna.utr_tex_cover
            args_srna.notex = args_srna.utr_notex_cover
        # The three UTR coverage values map to 5'UTR / 3'UTR / interCDS.
        args_srna.coverages = {"5utr": args_srna.utr_coverages[0],
                               "3utr": args_srna.utr_coverages[1],
                               "interCDS": args_srna.utr_coverages[2]}
        if args_srna.notex is not None:
            args_srna.cover_notex = {"5utr": args_srna.notex[0],
                                     "3utr": args_srna.notex[1],
                                     "interCDS": args_srna.notex[2]}
        else:
            args_srna.cover_notex = None
        return args_srna

    def extend_inter_container(self, args_srna, tsss, pros, wigs_f, wigs_r,
                               nums, output, out_table, texs, detects,
                               cutoff_coverage, notex):
        """Attach per-genome runtime data for intergenic sRNA detection."""
        args_srna.tsss = tsss
        args_srna.pros = pros
        args_srna.wigs_f = wigs_f
        args_srna.wigs_r = wigs_r
        args_srna.nums = nums
        args_srna.output = output
        args_srna.out_table = out_table
        args_srna.texs = texs
        args_srna.detects = detects
        args_srna.cutoff_coverage = cutoff_coverage
        args_srna.notex = notex
        return args_srna

    def extend_utr_container(self, args_srna, cdss, tsss, pros, wig_fs,
                             wig_rs, out, out_t, texs):
        """Attach per-genome runtime data for UTR-derived sRNA detection."""
        args_srna.cdss = cdss
        args_srna.tsss = tsss
        args_srna.pros = pros
        args_srna.wig_fs = wig_fs
        args_srna.wig_rs = wig_rs
        args_srna.out = out
        args_srna.out_t = out_t
        args_srna.texs = texs
        args_srna.utrs = []
        args_srna.srnas = []
        return args_srna

    def container_sorf(self, sorf_folder, UTR_derived_sORF,
                       transcript_folder, annotation_folder, TSS_folder,
                       utr_length, min_length, max_length, tex_wig_folder,
                       frag_wig_folder, cutoff_intergenic_coverage,
                       cutoff_antisense_coverage, cutoff_5utr_coverage,
                       cutoff_3utr_coverage, cutoff_interCDS_coverage,
                       fasta_folder, tex_notex_libs, frag_libs, tex_notex,
                       replicates_tex, replicates_frag, table_best,
                       sRNA_folder, start_codon, stop_codon,
                       cutoff_background, fuzzy_rbs, rbs_not_after_TSS,
                       print_all_combination, best_no_sRNA, best_no_TSS,
                       ignore_hypothetical_protein, min_rbs_distance,
                       max_rbs_distance):
        """Namespace for the sORF-detection subcommand."""
        self.out_folder = sorf_folder
        self.utr_detect = UTR_derived_sORF
        self.trans = transcript_folder
        self.gffs = annotation_folder
        self.tsss = TSS_folder
        self.utr_length = utr_length
        self.min_len = min_length
        self.max_len = max_length
        self.tex_wigs = tex_wig_folder
        self.frag_wigs = frag_wig_folder
        self.cutoff_inter = cutoff_intergenic_coverage
        self.cutoff_anti = cutoff_antisense_coverage
        self.cutoff_5utr = cutoff_5utr_coverage
        self.cutoff_3utr = cutoff_3utr_coverage
        self.cutoff_intercds = cutoff_interCDS_coverage
        self.fastas = fasta_folder
        self.tlibs = self._deal_multi_inputs(tex_notex_libs, "str", None,
                                             None)
        self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None)
        self.libs = self._check_libs(self.tlibs, self.flibs)
        self.tex_notex = tex_notex
        self.replicates_tex = replicates_tex
        self.replicates_frag = replicates_frag
        self.replicates = self._check_replicates(
            replicates_tex, replicates_frag)
        self.table_best = table_best
        self.srnas = sRNA_folder
        self.start_codon = self._deal_multi_inputs(start_codon, "str", None,
                                                   None)
        self.stop_codon = self._deal_multi_inputs(stop_codon, "str", None,
                                                  None)
        self.background = cutoff_background
        self.fuzzy_rbs = fuzzy_rbs
        self.noafter_tss = rbs_not_after_TSS
        self.print_all = print_all_combination
        self.no_srna = best_no_sRNA
        self.no_tss = best_no_TSS
        self.hypo = ignore_hypothetical_protein
        self.min_rbs = min_rbs_distance
        self.max_rbs = max_rbs_distance
        self = self._parser_combine_wigs("sorf")
        return self

    def container_srna_target(
            self, Vienna_folder, annotation_path, fasta_path, sRNA_path,
            query_sRNA, program, interaction_length, window_size_target,
            span_target, window_size_srna, span_srna,
            unstructured_region_RNAplex_target,
            unstructured_region_RNAplex_srna, unstructured_region_RNAup,
            energy_threshold, duplex_distance, top, starget_output_folder,
            process_rnaplex, process_rnaup, continue_rnaup,
            potential_target_start, potential_target_end, target_feature):
        """Namespace for the sRNA-target prediction subcommand.

        NOTE(review): this method is cut off by the chunk boundary; the
        remainder (starting with the "self.top" assignment) lies beyond
        this view and must not be edited here.
        """
        self.vienna_path = Vienna_folder
        self.gffs = annotation_path
        self.fastas = fasta_path
        self.srnas = sRNA_path
        self.query = self._deal_multi_inputs(query_sRNA, "str", None, None)
        self.program = program
        self.inter_length = interaction_length
        self.win_size_t = window_size_target
        self.span_t = span_target
        self.win_size_s = window_size_srna
        self.span_s = span_srna
        self.unstr_region_rnaplex_t = unstructured_region_RNAplex_target
        self.unstr_region_rnaplex_s = unstructured_region_RNAplex_srna
        self.unstr_region_rnaup = unstructured_region_RNAup
        self.energy = energy_threshold
        self.duplex_dist = duplex_distance
        self.top
= top self.out_folder = starget_output_folder self.core_plex = process_rnaplex self.core_up = process_rnaup self.continue_rnaup = continue_rnaup self.tar_start = potential_target_start self.tar_end = potential_target_end self.features = self._deal_multi_inputs(target_feature, "str", None, None) return self def container_goterm(self, annotation_path, goterm_output_folder, UniProt_id, go_obo, goslim_obo, transcript_path): self.gffs = annotation_path self.out_folder = goterm_output_folder self.uniprot = UniProt_id self.go = go_obo self.goslim = goslim_obo self.trans = transcript_path return self def container_sublocal(self, Psortb_path, gff_path, fasta_path, bacteria_type, difference_multi, merge_to_gff, sublocal_output_folder, transcript_path): self.psortb_path = Psortb_path self.gffs = gff_path self.fastas = fasta_path self.gram = bacteria_type self.fuzzy = difference_multi self.merge = merge_to_gff self.out_folder = sublocal_output_folder self.trans = transcript_path return self def container_ppi(self, gff_path, proteinID_strains, without_strain_pubmed, species_STRING, score, ppi_output_folder, node_size, query): self.ptts = gff_path self.strains = self._deal_multi_inputs(proteinID_strains, "str", None, None) self.no_specific = without_strain_pubmed self.species = species_STRING self.score = score self.out_folder = ppi_output_folder self.size = node_size self.querys = self._deal_multi_inputs(query, "str", None, None) return self def container_promoter(self, MEME_path, promoter_output_folder, tex_libs, TSS_folder, fasta_folder, num_motif, nt_before_TSS, motif_width, TSS_source, tex_wig_path, annotation_folder, combine_all, e_value): self.meme_path = MEME_path self.output_folder = promoter_output_folder self.input_libs = self._deal_multi_inputs(tex_libs, "str", None, None) self.tsss = TSS_folder self.fastas = fasta_folder self.num_motif = num_motif self.nt_before = nt_before_TSS self.widths = self._deal_multi_inputs(motif_width, "str", None, None) self.source = 
TSS_source self.wigs = tex_wig_path self.gffs = annotation_folder self.combine = combine_all self.e_value = e_value return self def container_operon(self, TSS_folder, annotation_folder, transcript_folder, UTR5_folder, UTR3_folder, term_folder, TSS_fuzzy, term_fuzzy, min_length, statistics, operon_output_folder, combine_gff, operon_statistics_folder): self.tsss = TSS_folder self.gffs = annotation_folder self.trans = transcript_folder self.utr5s = UTR5_folder self.utr3s = UTR3_folder self.terms = term_folder self.tss_fuzzy = TSS_fuzzy self.term_fuzzy = term_fuzzy self.length = min_length self.statistics = statistics self.output_folder = operon_output_folder self.combine = combine_gff self.stat_folder = operon_statistics_folder return self def container_snp(self, samtools_path, bcftools_path, bam_type, program, fasta_path, tex_bam_path, frag_bam_path, quality, read_depth, snp_output_folder, indel_fraction, chrom): self.samtools_path = samtools_path self.bcftools_path = bcftools_path self.types = bam_type self.program = self._deal_multi_inputs(program, "str", None, None) self.fastas = fasta_path self.normal_bams = tex_bam_path self.frag_bams = frag_bam_path self.quality = quality self.depth = read_depth self.out_folder = snp_output_folder self.fraction = indel_fraction if chrom == "haploid": chrom = "1" elif chrom == "diploid": chrom = "2" self.chrom = chrom return self def container_circrna(self, align, process, fasta_path, annotation_path, tex_bam_path, fragmented_bam_path, read_folder, circrna_stat_folder, support_reads, segemehl_folder, samtools_path, start_ratio, end_ratio, ignore_hypothetical_protein, out_folder): self.align = align self.cores = process self.fastas = fasta_path self.gffs = annotation_path self.normal_bams = tex_bam_path self.frag_bams = fragmented_bam_path self.read_folder = read_folder self.stat_folder = circrna_stat_folder self.support = support_reads self.segemehl_path = segemehl_folder self.samtools_path = samtools_path self.start_ratio = 
start_ratio self.end_ratio = end_ratio self.hypo = ignore_hypothetical_protein self.output_folder = out_folder return self def container_ribos(self, infernal_path, riboswitch_ID, gff_path, fasta_path, tss_path, transcript_path, Rfam, ribos_output_folder, e_value, output_all, database_folder, fuzzy, start_codon, min_dist_rbs, max_dist_rbs, fuzzy_rbs, UTR_length): self.infernal_path = infernal_path self.ribos_id = riboswitch_ID self.gffs = gff_path self.fastas = fasta_path self.tsss = tss_path self.trans = transcript_path self.rfam = Rfam self.out_folder = ribos_output_folder self.e_value = e_value self.output_all = output_all self.database = database_folder self.fuzzy = fuzzy self.start_codons = self._deal_multi_inputs(start_codon, "str", None, None) self.start_rbs = min_dist_rbs self.end_rbs = max_dist_rbs self.fuzzy_rbs = fuzzy_rbs self.utr = UTR_length return self def container_screen(self, main_gff, side_gffs, fasta, frag_wig_folder, tex_wig_folder, height, tex_libs, frag_libs, present, output_folder): self.main_gff = main_gff self.side_gffs = self._deal_multi_inputs(side_gffs, "str", None, None) self.fasta = fasta self.frag_wigs = frag_wig_folder self.tex_wigs = tex_wig_folder self.height = height self.tlibs = self._deal_multi_inputs(tex_libs, "str", None, None) self.flibs = self._deal_multi_inputs(frag_libs, "str", None, None) self.present = present self.output_folder = output_folder return self
class RATT(object):
    """Run the RATT (Rapid Annotation Transfer Tool) pipeline.

    Transfers annotations from reference EMBL/GenBank records onto target
    fasta genomes, then converts the RATT results into GFF/PTT/RNT files.
    Relies on project helpers (Multiparser, Converter, FormatFixer, Helper)
    for parsing, format conversion and file bookkeeping.
    """

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        # Temporary working locations derived from the user-supplied folders.
        self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
        self.gbk_tmp = os.path.join(self.gbk, "tmp")
        self.embl = os.path.join(args_ratt.ref_embls, "embls")
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files):
        """Generate .ptt/.rnt companions for every transferred .gff in files."""
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                # basename without the trailing ".gff"
                prefix = os.path.basename(gff)[:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix,
                                                     None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)

    def _remove_files(self, args_ratt, out_gbk):
        """Move final outputs into place and delete all temporary folders."""
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        shutil.rmtree(self.embl)
        self.helper.remove_all_content(args_ratt.tar_fastas, "_folder", "dir")
        self.helper.remove_all_content(args_ratt.ref_fastas, "_folder", "dir")
        if out_gbk:
            shutil.rmtree(out_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        """Convert one RATT EMBL result into a GFF file and record its name."""
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        """Split multi-record GenBank files into one file per record.

        Each record is written to self.gbk_tmp while being read and then
        moved to <self.gbk>/<accession>.gbk when the record terminator
        ("//") is reached. Returns the folder holding the split records.
        """
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            # Fix: "out" was previously unbound until the first LOCUS line
            # and was written to after being closed at a "//" terminator.
            out = None
            filename = None
            with open(file_, "r") as f_h:
                for line in f_h:
                    if line.startswith("LOCUS"):
                        out = open(self.gbk_tmp, "w")
                        for data in line.split(" "):
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data, "gbk"])
                                break
                    elif line.startswith("VERSION"):
                        new_filename = None
                        for data in line.split(" "):
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data, "gbk"])
                                break
                        # Prefer the VERSION-derived name when it is not a
                        # refinement of the LOCUS one. This replaces the old
                        # truthiness test on str.find (0 == prefix match).
                        if new_filename is not None and (
                                filename is None or
                                not new_filename.startswith(filename)):
                            filename = new_filename
                    if out is not None:
                        out.write(line)
                    if line.startswith("//") and (out is not None):
                        out.close()
                        out = None
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if (not close) and (out is not None):
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        """Convert all .gbk files in ref_embls to EMBL format.

        Exits with an error message when the folder holds no .gbk file.
        Returns the folder of split GenBank records (out_gbk).
        """
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if embl.endswith(".gbk"):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            print("Error: please assign proper folder for Genebank file!!!")
            sys.exit()
        else:
            out_gbk = self._parser_embl_gbk(gbks)
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        """Invoke the external RATT executable for one target/reference pair."""
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        """Run RATT for every ref:target pair and tidy its working files."""
        print("Running RATT...")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            # Fix: the log handle used to be re-opened per pair but closed
            # only once after the loop, leaking one handle per extra pair.
            with open(self.ratt_log, "w+") as out:
                print(tar)
                self._run_ratt(args_ratt, tar, ref, out)
                for filename in os.listdir():
                    if "final" in filename:
                        shutil.move(filename,
                                    os.path.join(args_ratt.output_path,
                                                 filename))
                    elif (args_ratt.element in filename) or (
                            "query" in filename) or (
                            "Reference" in filename) or (
                            "Query" in filename) or (
                            "Sequences" in filename):
                        # RATT scatters scratch files/folders in the CWD.
                        if os.path.isfile(filename):
                            os.remove(filename)
                        if os.path.isdir(filename):
                            shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt):
        """Entry point: transfer annotations and emit merged GFF/PTT/RNT."""
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = self._convert_embl(args_ratt.ref_embls)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                prefix = datas[0][:-3]
                for file_ in os.listdir(os.path.join(args_ratt.tar_fastas,
                                                     folder)):
                    files.append(file_[:-3])
                # Merge the per-sequence outputs belonging to this genome.
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                args_ratt.gff_outfolder, gff),
                                self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                args_ratt.gff_outfolder, gff),
                                self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                args_ratt.gff_outfolder, gff),
                                self.tmp_files["rnt"])
                shutil.move(self.tmp_files["gff"], os.path.join(
                    self.tmp_files["out_gff"], prefix + ".gff"))
                shutil.move(self.tmp_files["ptt"], os.path.join(
                    self.tmp_files["out_gff"], prefix + ".ptt"))
                shutil.move(self.tmp_files["rnt"], os.path.join(
                    self.tmp_files["out_gff"], prefix + ".rnt"))
        self._remove_files(args_ratt, out_gbk)
class SNPCalling(object):
    """Detect SNPs of transcripts with samtools/bcftools.

    Merges and sorts the input BAM files, runs samtools mpileup with the
    selected BAQ modes and bcftools call, then post-processes the VCFs
    into per-genome SNP tables and statistics.
    """

    def __init__(self, args_snp):
        self.multiparser = Multiparser()
        self.seq_editer = SeqEditer()
        self.helper = Helper()
        # The output layout depends on whether the BAMs were mapped to the
        # reference or the target genome.
        if args_snp.types == "reference":
            file_type = "compare_reference"
        else:
            file_type = "validate_target"
        self.seq_path = os.path.join(args_snp.out_folder, file_type, "seqs")
        self.stat_path = os.path.join(args_snp.out_folder, file_type,
                                      "statistics")
        self.fasta_path = os.path.join(args_snp.fastas, "tmp")
        self.outputs = {"table": os.path.join(
            args_snp.out_folder, file_type, "SNP_table"),
            "raw": os.path.join(
                args_snp.out_folder, file_type, "SNP_raw_outputs"),
            "tmp": os.path.join(args_snp.out_folder, "tmp_bcf")}
        # A leftover merged BAM from a previous run must not be re-merged.
        if "whole_reads.bam" in os.listdir(args_snp.out_folder):
            self.helper.remove_all_content(args_snp.out_folder,
                                           "whole_read", "file")
        self.bams = {"whole": os.path.join(args_snp.out_folder,
                                           "whole_reads.bam"),
                     "sort": os.path.join(args_snp.out_folder,
                                          "whole_reads_sorted.bam")}
        self.header = os.path.join(args_snp.out_folder, "header")
        self.baqs = {"with": "with_BAQ", "without": "without_BAQ",
                     "extend": "extend_BAQ"}

    def _import_bam(self, bam_folder, bams):
        """Append every .bam path found in bam_folder to bams; return count."""
        num_bam = 0
        for bam in os.listdir(bam_folder):
            if bam.endswith(".bam"):
                num_bam += 1
                bams.append(os.path.join(bam_folder, bam))
        return num_bam

    def _transcript_snp(self, fasta, snp, out_table_prefix, type_, prefix,
                        bam_number, table_path, args_snp):
        """Post-process one VCF into SNP tables/sequences and statistics."""
        seq_path = os.path.join(self.seq_path, self.baqs[type_], prefix)
        stat_file = os.path.join(self.stat_path, "_".join([
            "stat", "_".join([prefix, self.baqs[type_]]), "SNP.csv"]))
        snp_detect(fasta, snp, out_table_prefix,
                   os.path.join(seq_path, prefix), bam_number,
                   stat_file, args_snp)
        self.helper.move_all_content(table_path, self.stat_path, [".png"])

    def _run_tools(self, fasta_file, out_bcf, out_raw_prefix, type_,
                   args_snp):
        """Run samtools mpileup (BAQ mode per type_) and bcftools call.

        Returns the path of the produced VCF file.
        """
        if type_ == "with":
            call([args_snp.samtools_path, "mpileup",
                  "-t", "DP", "-ugf", fasta_file, self.bams["sort"],
                  "--ignore-RG"], stdout=out_bcf)
        elif type_ == "without":
            call([args_snp.samtools_path, "mpileup",
                  "-t", "DP", "-B", "-ugf", fasta_file, self.bams["sort"],
                  "--ignore-RG"], stdout=out_bcf)
        elif type_ == "extend":
            call([args_snp.samtools_path, "mpileup",
                  "-t", "DP", "-E", "-ugf", fasta_file, self.bams["sort"],
                  "--ignore-RG"], stdout=out_bcf)
        out_vcf = "_".join([out_raw_prefix, self.baqs[type_] + ".vcf"])
        # Haploid calling needs an explicit --ploidy; diploid is the default.
        if args_snp.chrom == "1":
            call([args_snp.bcftools_path, "call", "--ploidy", args_snp.chrom,
                  self.outputs["tmp"], "-vmO", "v", "-o", out_vcf])
        elif args_snp.chrom == "2":
            call([args_snp.bcftools_path, "call",
                  self.outputs["tmp"], "-vmO", "v", "-o", out_vcf])
        return out_vcf

    def _run_sub(self, args_snp, fasta_file, type_, file_prefixs, prefix,
                 table_path, bam_number):
        """Run one BAQ mode for one genome and post-process the result."""
        # Fix: manage the temporary BCF handle with a context manager so it
        # is closed even if a downstream step raises.
        with open(self.outputs["tmp"], "w") as out_bcf:
            out_vcf = self._run_tools(fasta_file, out_bcf,
                                      file_prefixs["raw_prefix"],
                                      type_, args_snp)
            self.helper.check_make_folder(
                os.path.join(self.seq_path, self.baqs[type_], prefix))
            self._transcript_snp(
                fasta_file, out_vcf,
                "_".join([file_prefixs["table_prefix"], self.baqs[type_]]),
                type_, prefix, bam_number, table_path, args_snp)

    def _run_program(self, fasta_file, file_prefixs, prefix, bam_number,
                     table_path, args_snp):
        """Dispatch the requested BAQ modes ("1"/"2"/"3") for one genome."""
        for index in args_snp.program:
            if index == "1":
                type_ = "with"
                print("Running SNP calling with BAQ...")
            elif index == "2":
                type_ = "without"
                print("Running SNP calling without BAQ...")
            elif index == "3":
                print("Running SNP calling extend BAQ...")
                type_ = "extend"
            else:
                print("Error: No correct program, please assign 1, 2, 3")
                sys.exit()
            self._run_sub(args_snp, fasta_file, type_, file_prefixs, prefix,
                          table_path, bam_number)

    def _detect_fasta(self, fasta):
        """Return (is_fasta, prefix) for a filename.

        prefix is the name without its fasta extension, or None when the
        extension is not recognized.
        """
        detect = False
        # Fix: prefix was previously unbound (NameError) for any filename
        # without a recognized fasta extension.
        prefix = None
        if fasta.endswith(".fa"):
            prefix = fasta[:-3]
            detect = True
        elif fasta.endswith(".fna"):
            prefix = fasta[:-4]
            detect = True
        elif fasta.endswith(".fasta"):
            prefix = fasta[:-6]
            detect = True
        return (detect, prefix)

    def _run_bam(self, samtools_path, sub_command, bam_file):
        """Run 'samtools merge' or 'samtools sort' via the shell."""
        if sub_command == "merge":
            command = " ".join([samtools_path, sub_command,
                                self.bams["whole"], bam_file])
        elif sub_command == "sort":
            command = " ".join([samtools_path, sub_command,
                                "-o", bam_file, self.bams["whole"]])
        else:
            # Fix: an unknown sub-command previously raised a confusing
            # NameError on the unbound "command" variable.
            raise ValueError("Unknown samtools sub-command: " + sub_command)
        os.system(command)

    def _merge_bams(self, args_snp):
        """Collect, merge (if needed) and sort all input BAM files.

        Returns the number of BAM files found. Exits with an error when no
        folder was assigned or when the folders contain no BAM file.
        """
        bams = []
        num_normal = 0
        num_frag = 0
        if (args_snp.frag_bams is None) and (args_snp.normal_bams is None):
            print("Error: There is no BAMs folders!!")
            sys.exit()
        else:
            if args_snp.normal_bams is not None:
                num_normal = self._import_bam(args_snp.normal_bams, bams)
            if args_snp.frag_bams is not None:
                num_frag = self._import_bam(args_snp.frag_bams, bams)
        num_bam = num_normal + num_frag
        # Fix: zero BAM files used to crash with IndexError on bams[0];
        # report it explicitly instead.
        if num_bam == 0:
            print("Error: There is no BAM file in the BAMs folders!!")
            sys.exit()
        elif num_bam == 1:
            shutil.copyfile(bams[0], self.bams["whole"])
            print("Sort BAM file now ...")
            self._run_bam(args_snp.samtools_path, "sort", self.bams["sort"])
        else:
            print("Merge BAM files now ...")
            self._run_bam(args_snp.samtools_path, "merge", " ".join(bams))
            print("Sort BAM file now ...")
            self._run_bam(args_snp.samtools_path, "sort", self.bams["sort"])
        return num_bam

    def _modify_header(self, fastas):
        """Normalize the headers of every fasta file in fastas."""
        for fasta in os.listdir(fastas):
            if fasta.endswith("fasta") or \
                    fasta.endswith("fa") or \
                    fasta.endswith("fna"):
                self.seq_editer.modify_header(os.path.join(fastas, fasta))

    def _get_header(self, samtools_path):
        """Dump the sorted BAM's header into self.header via the shell."""
        command = " ".join([samtools_path, "view", "-H", self.bams["sort"]])
        os.system(">".join([command, self.header]))

    def _get_genome_name(self, samtools_path):
        """Return the sequence names (@SQ SN: values) of the sorted BAM."""
        self._get_header(samtools_path)
        seq_names = []
        # Fix: use a context manager so the header file is always closed.
        with open(self.header, "r") as fh:
            for row in csv.reader(fh, delimiter="\t"):
                if row[0] == "@SQ":
                    seq_names.append(row[1].split(":")[1])
        return seq_names

    def run_snp_calling(self, args_snp):
        """Entry point: run the full SNP-calling workflow."""
        self.multiparser.parser_fasta(args_snp.fastas)
        self._modify_header(args_snp.fastas)
        bam_number = self._merge_bams(args_snp)
        seq_names = self._get_genome_name(args_snp.samtools_path)
        if ("1" not in args_snp.program) and (
                "2" not in args_snp.program) and (
                "3" not in args_snp.program):
            # Fix: the message wrongly described '2' as 'with_BAQ'.
            print("Error: Please assign a correct BAQ type: "
                  "'1' means 'with_BAQ', '2' means 'without_BAQ' or "
                  "'3' means 'extend_BAQ'.")
            sys.exit()
        else:
            for fasta in os.listdir(self.fasta_path):
                # Only process sequences that actually occur in the BAM.
                if (fasta.split(".f")[0] in seq_names):
                    fasta_datas = self._detect_fasta(fasta)
                    detect = fasta_datas[0]
                    prefix = fasta_datas[1]
                    if detect:
                        detect = False
                        print("Computing {0} now ...".format(fasta))
                        self.helper.check_make_folder(
                            os.path.join(self.outputs["table"], prefix))
                        self.helper.check_make_folder(
                            os.path.join(self.outputs["raw"], prefix))
                        file_prefixs = {"raw_prefix": os.path.join(
                            self.outputs["raw"], prefix, prefix),
                            "table_prefix": os.path.join(
                                self.outputs["table"], prefix, prefix)}
                        fasta_file = os.path.join(self.fasta_path, fasta)
                        table_path = os.path.join(self.outputs["table"],
                                                  prefix)
                        self._run_program(fasta_file, file_prefixs, prefix,
                                          bam_number, table_path, args_snp)
                        os.remove(self.outputs["tmp"])
        self.helper.remove_tmp(args_snp.fastas)
        os.remove(self.bams["whole"])
        os.remove(self.bams["sort"])
        os.remove(self.header)
class MEME(object): '''detection of promoter''' def __init__(self, args_pro): self.multiparser = Multiparser() self.helper = Helper() self.tss_path = os.path.join(args_pro.tsss, "tmp") if args_pro.gffs is not None: self.gff_path = os.path.join(args_pro.gffs, "tmp") else: self.gff_path = None self.out_fasta = os.path.join(args_pro.output_folder, "fasta_classes") self.tmp_folder = os.path.join(os.getcwd(), "tmp") self.fastas = {"pri": os.path.join(self.tmp_folder, "primary.fa"), "sec": os.path.join(self.tmp_folder, "secondary.fa"), "inter": os.path.join(self.tmp_folder, "internal.fa"), "anti": os.path.join(self.tmp_folder, "antisense.fa"), "orph": os.path.join(self.tmp_folder, "orphan.fa"), "all_no_orph": "without_orphan.fa", "all": "all_type.fa", "tmp_fa": os.path.join(self.tmp_folder, "tmp.fa"), "tmp_all": os.path.join(self.tmp_folder, "tmp_all.fa")} self.all_fasta = os.path.join(args_pro.fastas, "allfasta.fa") self.all_tss = os.path.join(self.tss_path, "allfasta_TSS.gff") def _gen_and_check_folder(self, out_path, folder, type_): sub_out_folder = os.path.join(out_path, type_) if folder in os.listdir(sub_out_folder): shutil.rmtree(os.path.join(sub_out_folder, folder)) return sub_out_folder def _run_normal_motif(self, input_path, out_path, filename, fasta, width, args_pro, log): '''run MEME with specific width''' folder = "_".join(["promoter_motifs", filename, str(width), "nt"]) if (args_pro.program.lower() == "meme") or ( args_pro.program.lower() == "both"): meme_folder = self._gen_and_check_folder( out_path, folder, "MEME") command = [args_pro.meme_path, "-maxsize", "1000000", "-dna", "-nmotifs", str(args_pro.num_motif), "-w", str(width), "-maxiter", "100", "-evt", str(args_pro.e_value)] if args_pro.para is not None: command = command + ["-p", args_pro.para] log.write(" ".join(command + ["-oc", os.path.join( meme_folder, folder), os.path.join(input_path, fasta)]) + "\n") call(command + ["-oc", os.path.join(meme_folder, folder), os.path.join(input_path, fasta)]) if 
(args_pro.program.lower() == "glam2") or ( args_pro.program.lower() == "both"): glam_folder = self._gen_and_check_folder( out_path, folder, "GLAM2") log.write(" ".join([args_pro.glam2_path, "-O", os.path.join(glam_folder, folder), "-w", str(width), "-b", str(width), "-r", str(args_pro.num_motif), "-n", str(args_pro.end_run), "n", os.path.join(input_path, fasta)]) + "\n") call([args_pro.glam2_path, "-O", os.path.join(glam_folder, folder), "-w", str(width), "-b", str(width), "-r", str(args_pro.num_motif), "-n", str(args_pro.end_run), "n", os.path.join(input_path, fasta)]) def _run_small_motif(self, input_path, out_path, filename, fasta, width, args_pro, log): '''run MEME with range of width''' data = width.split("-") min_width = data[0] max_width = data[1] folder = "_".join(["promoter_motifs", filename, "-".join([str(min_width), str(max_width)]), "nt"]) if (args_pro.program.lower() == "meme") or ( args_pro.program.lower() == "both"): meme_folder = self._gen_and_check_folder( out_path, folder, "MEME") command = [args_pro.meme_path, "-maxsize", "1000000", "-dna", "-nmotifs", str(args_pro.num_motif), "-minsites", "0", "-maxsites", "2", "-minw", str(min_width), "-maxw", str(max_width), "-maxiter", "100", "-evt", str(args_pro.e_value)] if args_pro.para is not None: command = command + ["-p", args_pro.para] log.write(" ".join(command + ["-oc", os.path.join( meme_folder, folder), os.path.join(input_path, fasta)]) + "\n") call(command + ["-oc", os.path.join(meme_folder, folder), os.path.join(input_path, fasta)]) if (args_pro.program.lower() == "glam2") or ( args_pro.program.lower() == "both"): glam_folder = self._gen_and_check_folder( out_path, folder, "GLAM2") log.write(" ".join([args_pro.glam2_path, "-O", os.path.join(glam_folder, folder), "-a", str(min_width), "-b", str(max_width), "-r", str(args_pro.num_motif), "-n", str(args_pro.end_run), "n", os.path.join(input_path, fasta)]) + "\n") call([args_pro.glam2_path, "-O", os.path.join(glam_folder, folder), "-a", 
str(min_width), "-b", str(max_width), "-r", str(args_pro.num_motif), "-n", str(args_pro.end_run), "n", os.path.join(input_path, fasta)]) def _get_fasta_file(self, fasta_path, prefix): for fasta in os.listdir(fasta_path): if (fasta.endswith(".fa")) and \ (prefix == fasta.replace(".fa", "")): break elif (fasta.endswith(".fna")) and \ (prefix == fasta.replace(".fna", "")): break elif (fasta.endswith(".fasta")) and \ (prefix == fasta.replace(".fasta", "")): break return fasta def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _move_and_merge_fasta(self, input_path, prefix): all_type = os.path.join(self.tmp_folder, self.fastas["all"]) all_no_orph = os.path.join(self.tmp_folder, self.fastas["all_no_orph"]) if self.fastas["all"] in os.listdir(self.tmp_folder): os.remove(all_type) if self.fastas["all_no_orph"] in os.listdir(self.tmp_folder): os.remove(all_no_orph) shutil.copyfile(self.fastas["pri"], self.fastas["tmp_fa"]) self.helper.merge_file(self.fastas["sec"], self.fastas["tmp_fa"]) self.helper.merge_file(self.fastas["inter"], self.fastas["tmp_fa"]) self.helper.merge_file(self.fastas["anti"], self.fastas["tmp_fa"]) shutil.copyfile(self.fastas["tmp_fa"], self.fastas["tmp_all"]) self.helper.merge_file(self.fastas["orph"], self.fastas["tmp_all"]) del_repeat_fasta(self.fastas["tmp_fa"], all_no_orph) del_repeat_fasta(self.fastas["tmp_all"], all_type) os.remove(self.fastas["tmp_fa"]) os.remove(self.fastas["tmp_all"]) out_prefix = os.path.join(input_path, prefix) shutil.move(self.fastas["pri"], "_".join([ out_prefix, "allgenome_primary.fa"])) shutil.move(self.fastas["sec"], "_".join([ out_prefix, "allgenome_secondary.fa"])) shutil.move(self.fastas["inter"], "_".join([ out_prefix, "allgenome_internal.fa"])) shutil.move(self.fastas["anti"], "_".join([ out_prefix, "allgenome_antisense.fa"])) shutil.move(self.fastas["orph"], "_".join([ out_prefix, "allgenome_orphan.fa"])) 
        # Tail of a method that begins above this chunk: publish the two
        # combined FASTA files under their final "allgenome" names.
        shutil.move(all_type, "_".join([
            out_prefix, "allgenome_all_types.fa"]))
        shutil.move(all_no_orph, "_".join([
            out_prefix, "allgenome_without_orphan.fa"]))

    def _split_fasta_by_strain(self, input_path):
        """Split each combined "allgenome" fasta into one file per strain.

        Header lines are split on "_" and the strain name taken as
        everything after the second underscore -- assumes headers look
        like ">tag_tag_<strain...>"; TODO confirm header layout.
        Per-strain files that would duplicate a single-strain combined
        file are removed again at the end.
        """
        # Drop everything except the combined "allgenome" files first.
        for fasta in os.listdir(input_path):
            if "allgenome" not in fasta:
                os.remove(os.path.join(input_path, fasta))
        out = None
        for fasta in os.listdir(input_path):
            if fasta.endswith(".fa"):
                pre_strain = ""
                num_strain = 0
                with open(os.path.join(input_path, fasta), "r") as f_h:
                    for line in f_h:
                        line = line.strip()
                        if line.startswith(">"):
                            datas = line.split("_")
                            strain = "_".join(datas[2:])
                            if pre_strain != strain:
                                # New strain encountered: close the previous
                                # per-strain file and open the next one.
                                num_strain += 1
                                filename = fasta.split("allgenome")
                                if out is not None:
                                    out.close()
                                out = open(os.path.join(
                                    input_path, "".join([
                                        filename[0], strain,
                                        filename[-1]])), "a")
                                pre_strain = strain
                            out.write(line + "\n")
                        else:
                            out.write(line + "\n")
                if num_strain <= 1:
                    # Only one strain seen: the per-strain copy duplicates
                    # the combined file, so remove it.
                    os.remove(os.path.join(input_path, "".join(
                        [filename[0], strain, filename[-1]])))
        # NOTE(review): raises AttributeError if no ".fa" file contained a
        # header line (out stays None) -- confirm inputs are never empty.
        out.close()

    def _run_program(self, prefixs, args_pro, log, input_fastas):
        """Run MEME and/or GLAM2 on every selected upstream-fasta file.

        Only fasta files whose TSS-type suffix appears in input_fastas
        are processed; each requested motif width is run separately.
        """
        log.write("Using MEME or GLAM2 to predict promoter.\n")
        log.write("Please make sure their versions are at least 4.11.1.\n")
        log.write("If you are running for parallel, please make sure you "
                  "have install MPICH and its version is at least 3.2.\n")
        for prefix in prefixs:
            input_path = os.path.join(self.out_fasta, prefix)
            out_path = os.path.join(args_pro.output_folder, prefix)
            # Create the per-tool output folders requested by --program.
            if args_pro.program.lower() == "both":
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
                self.helper.check_make_folder(os.path.join(out_path, "GLAM2"))
            elif args_pro.program.lower() == "meme":
                self.helper.check_make_folder(os.path.join(out_path, "MEME"))
            elif args_pro.program.lower() == "glam2":
                self.helper.check_make_folder(os.path.join(out_path, "GLAM2"))
            for fasta in os.listdir(input_path):
                filename = fasta.replace(".fa", "")
                names = filename.split("_")
                # Match either a single-token type (e.g. "primary") or the
                # two-token types "all_types" / "without_orphan".
                if (names[-1] in input_fastas) or (
                        ("_".join(names[-2:]) == "all_types") and (
                        "all_types" in input_fastas)) or (
                        ("_".join(names[-2:]) == "without_orphan") and (
                        "without_orphan" in input_fastas)):
                    for width in args_pro.widths:
                        print("Computing promoters of {0} - {1}".format(
                            fasta, width))
                        log.write("Computing promoters of {0} - "
                                  "length {1}.\n".format(fasta, width))
                        # A width range like "30-40" uses the small-motif
                        # path; a single width uses the normal path.
                        if "-" in width:
                            self._run_small_motif(input_path, out_path,
                                                  filename, fasta, width,
                                                  args_pro, log)
                        else:
                            self._run_normal_motif(input_path, out_path,
                                                   filename, fasta, width,
                                                   args_pro, log)
            log.write("Promoter search for {0} is done.\n".format(prefix))
            log.write("All the output files from MEME or GLAM2 are generated "
                      "and stored in {0}.\n".format(out_path))

    def _combine_file(self, prefixs, args_pro):
        '''combine all TSS file in the input folder to generate the
        global TSS for detecting the global promoter'''
        if args_pro.source:
            for tss in os.listdir(self.tss_path):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                        self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                        args_pro.fastas, fasta), self.all_fasta)
        else:
            # NOTE(review): this branch lists "TSS_classes" but merges files
            # from self.tss_path -- confirm the source folder is intended.
            for tss in os.listdir(os.path.join(
                    args_pro.output_folder, "TSS_classes")):
                if tss.endswith("_TSS.gff"):
                    self.helper.merge_file(os.path.join(
                        self.tss_path, tss), self.all_tss)
            for fasta in os.listdir(args_pro.fastas):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    self.helper.merge_file(os.path.join(
                        args_pro.fastas, fasta), self.all_fasta)
        print("Generating fasta file of all sequences")
        prefixs.append("allfasta")
        input_path = os.path.join(self.out_fasta, "allfasta")
        self.helper.check_make_folder(os.path.join(
            args_pro.output_folder, "allfasta"))
        self.helper.check_make_folder(os.path.join(
            self.out_fasta, "allfasta"))
        args_pro.source = True
        upstream(self.all_tss, self.all_fasta, None, None, args_pro, None)
        self._move_and_merge_fasta(input_path, "allfasta")

    def _remove_files(self, args_pro):
        # Clean up every temporary folder created during the run.
        self.helper.remove_tmp_dir(args_pro.fastas)
        self.helper.remove_tmp_dir(args_pro.tsss)
        self.helper.remove_tmp_dir(args_pro.gffs)
        if "tmp_wig" in os.listdir(args_pro.output_folder):
            shutil.rmtree(os.path.join(args_pro.output_folder, "tmp_wig"))
        if "allfasta" in os.listdir(os.getcwd()):
            shutil.rmtree("allfasta")
        if "tmp" in os.listdir(os.getcwd()):
            shutil.rmtree("tmp")

    def _gen_table(self, output_folder, prefixs, combine, program, log):
        '''generate the promoter table'''
        log.write("Running gen_promoter_table.py to generate promoter "
                  "table which is useful for sRNA prediction.\n")
        log.write("The following files are generated:\n")
        # When the per-genome results were combined, the merged "allfasta"
        # pseudo-strain gets a table as well.
        if combine:
            strains = prefixs + ["allfasta"]
        else:
            strains = prefixs
        for strain in strains:
            tss_file = os.path.join(self.tss_path, strain + "_TSS.gff")
            if (program.lower() == "both") or (
                    program.lower() == "meme"):
                for folder in os.listdir(os.path.join(output_folder,
                                                      strain, "MEME")):
                    csv_file = os.path.join(output_folder, strain,
                                            "MEME", folder, "meme.csv")
                    gen_promoter_table(os.path.join(output_folder, strain,
                                       "MEME", folder, "meme.txt"),
                                       csv_file, tss_file, "meme")
                    log.write("\t" + csv_file + "\n")
            if (program.lower() == "both") or (
                    program.lower() == "glam2"):
                for folder in os.listdir(os.path.join(output_folder,
                                                      strain, "GLAM2")):
                    csv_file = os.path.join(output_folder, strain,
                                            "GLAM2", folder, "glam2.csv")
                    gen_promoter_table(os.path.join(output_folder, strain,
                                       "GLAM2", folder, "glam2.txt"),
                                       csv_file, tss_file, "glam2")
                    log.write("\t" + csv_file + "\n")

    def _get_upstream(self, args_pro, prefix, tss, fasta):
        '''get upstream sequence of TSS'''
        if args_pro.source:
            # TSS file already carries type classification.
            print("Generating fasta file of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     None, None, args_pro, prefix)
        else:
            # TSS types must be (re)classified first, which needs the
            # genome annotation.
            if (args_pro.gffs is None):
                print("Error: Please assign proper annotation!!!")
                sys.exit()
            if "TSS_classes" not in os.listdir(args_pro.output_folder):
                os.mkdir(os.path.join(args_pro.output_folder, "TSS_classes"))
            print("Classifying TSSs and extracting sequence "
                  "of {0}".format(prefix))
            upstream(os.path.join(self.tss_path, tss),
                     os.path.join(args_pro.fastas, fasta),
                     os.path.join(self.gff_path, prefix + ".gff"),
                     os.path.join(args_pro.output_folder, "TSS_classes",
                                  "_".join([prefix, "TSS.gff"])),
                     args_pro, prefix)

    def _get_used_tss_type(self, args_pro):
        # Map the numeric --use_tss codes (1-7) to TSS-type names.
        input_fastas = []
        for tss in args_pro.use_tss:
            if int(tss) == 1:
                input_fastas.append("all_types")
            elif int(tss) == 2:
                input_fastas.append("primary")
            elif int(tss) == 3:
                input_fastas.append("secondary")
            elif int(tss) == 4:
                input_fastas.append("internal")
            elif int(tss) == 5:
                input_fastas.append("antisense")
            elif int(tss) == 6:
                input_fastas.append("orphan")
            elif int(tss) == 7:
                input_fastas.append("without_orphan")
            else:
                print("Error: The assignment of --use_tss_typ is wrong!")
                sys.exit()
        return input_fastas

    def run_meme(self, args_pro, log):
        """Main entry point: extract TSS upstream sequences and run
        MEME/GLAM2 promoter prediction over them."""
        # Remove leftovers of a previous combined run.
        if "allfasta.fa" in os.listdir(args_pro.fastas):
            os.remove(self.all_fasta)
            if "allfasta.fa_folder" in os.listdir(args_pro.fastas):
                shutil.rmtree(os.path.join(args_pro.fastas,
                              "allfasta.fa_folder"))
        self.multiparser.parser_fasta(args_pro.fastas)
        self.multiparser.parser_gff(args_pro.tsss, "TSS")
        if "allfasta_TSS.gff" in os.listdir(self.tss_path):
            os.remove(self.all_tss)
        if args_pro.gffs is not None:
            self._check_gff(args_pro.gffs)
            self.multiparser.parser_gff(args_pro.gffs, None)
            self.multiparser.combine_gff(args_pro.fastas, self.gff_path,
                                         "fasta", None)
        self._check_gff(args_pro.tsss)
        self.multiparser.combine_gff(args_pro.fastas, self.tss_path,
                                     "fasta", "TSS")
        self.helper.check_make_folder(self.out_fasta)
        self.helper.check_make_folder(self.tmp_folder)
        prefixs = []
        log.write("Running .TSS_upstream.py to extract the upstream "
                  "sequences of TSSs.\n")
        log.write("The following files are generated:\n")
        for tss in os.listdir(self.tss_path):
            prefix = tss.replace("_TSS.gff", "")
            prefixs.append(prefix)
            self.helper.check_make_folder(os.path.join(
                args_pro.output_folder, prefix))
            self.helper.check_make_folder(os.path.join(
                self.out_fasta, prefix))
            input_path = os.path.join(self.out_fasta, prefix)
            fasta = self._get_fasta_file(args_pro.fastas, prefix)
            self._get_upstream(args_pro, prefix, tss, fasta)
            self._move_and_merge_fasta(input_path, prefix)
            self._split_fasta_by_strain(input_path)
            for file_ in os.listdir(input_path):
                log.write("\t" + os.path.join(input_path, file_) + "\n")
        if args_pro.combine:
            self._combine_file(prefixs, args_pro)
            for file_ in os.listdir(os.path.join(
                    self.out_fasta, "allfasta")):
                log.write("\t" + os.path.join(
                    self.out_fasta, "allfasta", file_) + "\n")
        input_fastas = self._get_used_tss_type(args_pro)
        self._run_program(prefixs, args_pro, log, input_fastas)
        print("Generating the tables")
        self._gen_table(args_pro.output_folder, prefixs,
                        args_pro.combine, args_pro.program, log)
        self._remove_files(args_pro)
class sRNADetection(object):
    """Detect intergenic and UTR-derived sRNAs and filter/classify them.

    The pipeline (run_srna_detection) detects candidates per genome,
    filters them by secondary structure, BLAST hits, sORF overlap etc.,
    and finally writes per-genome GFF/CSV outputs plus statistics.
    """

    def __init__(self, args_srna):
        self.args_container = ArgsContainer()
        self.helper = Helper()
        self.multiparser = Multiparser()
        # Output layout below args_srna.out_folder.
        self.gff_output = os.path.join(args_srna.out_folder, "gffs")
        self.table_output = os.path.join(args_srna.out_folder, "tables")
        self.stat_path = os.path.join(args_srna.out_folder, "statistics")
        # Optional inputs resolve to "<folder>/tmp" or None.
        self.tss_path = self._check_folder_exist(args_srna.tss_folder)
        self.pro_path = self._check_folder_exist(args_srna.pro_folder)
        self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
        self.fasta_path = os.path.join(args_srna.fastas, "tmp")
        self.tran_path = os.path.join(args_srna.trans, "tmp")
        self.term_path = self._check_folder_exist(args_srna.terms)
        self.merge_wigs = os.path.join(args_srna.out_folder, "merge_wigs")
        # Prefixes of the intermediate files produced per genome.
        self.prefixs = {
            "merge": os.path.join(args_srna.out_folder, "tmp_merge"),
            "utr": os.path.join(args_srna.out_folder, "tmp_utrsrna"),
            "normal": os.path.join(args_srna.out_folder, "tmp_normal"),
            "in_cds": os.path.join(args_srna.out_folder, "tmp_incds"),
            "merge_table": os.path.join(args_srna.out_folder,
                                        "tmp_merge_table"),
            "utr_table": os.path.join(args_srna.out_folder,
                                      "tmp_utrsrna_table"),
            "normal_table": os.path.join(args_srna.out_folder,
                                         "tmp_normal_table"),
            "in_cds_table": os.path.join(args_srna.out_folder,
                                         "tmp_incds_table"),
            "basic": os.path.join(args_srna.out_folder, "tmp_basic"),
            "energy": os.path.join(args_srna.out_folder, "tmp_energy")}
        self.tmps = {"nr": os.path.join(args_srna.out_folder, "tmp_nr"),
                     "srna": os.path.join(args_srna.out_folder, "tmp_sRNA")}
        self.best_table = os.path.join(self.table_output, "best")
        # Fix: the original re-assigned self.table_output and
        # self.stat_path here with identical values; the dead duplicate
        # assignments were removed.
        self.all_best = {
            "all_gff": os.path.join(self.gff_output, "all_candidates"),
            "best_gff": os.path.join(self.gff_output, "best"),
            "all_table": os.path.join(self.table_output, "all_candidates"),
            "best_table": os.path.join(self.table_output, "best")}

    def _check_folder_exist(self, folder):
        """Return "<folder>/tmp" or None when the input was not given."""
        if folder is not None:
            path = os.path.join(folder, "tmp")
        else:
            path = None
        return path

    def _check_gff(self, gffs):
        # Validate attribute uniqueness of every GFF file in the folder.
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_format(self, blast_path, database, type_, db_file, err):
        # Build a BLAST database (type_ is "prot" or "nucl").
        call([os.path.join(blast_path, "makeblastdb"), "-in", database,
              "-dbtype", type_, "-out", db_file], stderr=err)

    def _formatdb(self, database, type_, out_folder, blast_path,
                  database_type):
        """Resolve the database fasta file and run makeblastdb on it."""
        # "with" guarantees the log handle is closed even on error
        # (the original leaked it when an exception occurred).
        with open(os.path.join(out_folder, "log.txt"), "w") as err:
            if not database.endswith((".fa", ".fna", ".fasta")):
                # Database was given without a fasta suffix: look for the
                # matching fasta file next to it.
                folders = database.split("/")
                filename = folders[-1]
                folder = "/".join(folders[:-1])
                for fasta in os.listdir(folder):
                    if fasta.endswith((".fa", ".fna", ".fasta")):
                        if ".".join(fasta.split(".")[:-1]) == filename:
                            database = os.path.join(folder, fasta)
            if database_type == "sRNA":
                # sRNA databases need a header reformat before indexing.
                change_format(database, "tmp_srna_database")
                os.remove(database)
                shutil.move("tmp_srna_database", database)
            db_file = ".".join(database.split(".")[:-1])
            self._run_format(blast_path, database, type_, db_file, err)

    def _merge_frag_tex_file(self, files, args_srna):
        """Merge fragmented- and TEX-library results into one gff/csv."""
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            self.helper.merge_file(files["frag_gff"], files["tex_gff"])
            self.helper.merge_file(files["frag_csv"], files["tex_csv"])
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])
            os.remove(files["frag_csv"])
            os.remove(files["frag_gff"])
            os.remove(files["tex_gff"])
        elif (args_srna.frag_wigs is not None):
            shutil.move(files["frag_csv"], files["merge_csv"])
            self.helper.sort_gff(files["frag_gff"], files["merge_gff"])
            os.remove(files["frag_gff"])
        elif (args_srna.tex_wigs is not None):
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])

    def _run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna):
        """Detect intergenic sRNAs for one genome; return its TSS file."""
        if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter"))
        files = {"frag_gff": None, "frag_csv": None,
                 "tex_gff": None, "tex_csv": None,
                 "merge_gff": None, "merge_csv": None}
        if ("tss" in args_srna.import_info):
            tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff",
                                               prefix, None, None)
        else:
            tss = None
        if self.pro_path is not None:
            pro = self.helper.get_correct_file(
                self.pro_path, "_processing.gff", prefix, None, None)
        else:
            pro = None
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_frag_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "frag", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss,
                pro, fuzzy_tss)
            intergenic_srna(args_srna)
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_tex_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "tex", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss,
                pro, fuzzy_tss)
            intergenic_srna(args_srna)
        files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["normal"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        # Prefer the re-classified TSS file when one was produced.
        if "TSS_class" in os.listdir(args_srna.out_folder):
            tss = os.path.join(args_srna.out_folder, "TSS_class",
                               prefix + "_TSS.gff")
        return tss

    def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna):
        """Detect UTR-derived sRNAs for one genome."""
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        files = {"frag_gff": None, "frag_csv": None,
                 "tex_gff": None, "tex_csv": None,
                 "merge_gff": None, "merge_csv": None}
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp_utr_tex_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files,
                pro, os.path.join(self.fasta_path, prefix + ".fa"),
                "tex", prefix, args_srna)
            utr_derived_srna(args_srna)
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder,
                "_".join(["tmp_utr_frag_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files,
                pro, os.path.join(self.fasta_path, prefix + ".fa"),
                "frag", prefix, args_srna)
            utr_derived_srna(args_srna)
        files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["utr"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        filter_utr(files["merge_gff"], files["merge_csv"],
                   args_srna.min_utr)

    def _check_necessary_file(self, args_srna):
        """Validate required inputs and parse the optional ones."""
        if (args_srna.gffs is None) or (args_srna.trans is None) or (
                (args_srna.tex_wigs is None) and (
                args_srna.frag_wigs is None)):
            print("Error: lack required files!!!!")
            sys.exit()
        if args_srna.utr_srna:
            if (args_srna.tss_folder is None):
                print("Error: lack required TSS files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            if (args_srna.pro_folder is None):
                # Processing sites are optional but improve results.
                print("Warning: lack Processing site files for UTR "
                      "derived sRNA detection!!!")
                print("it may effect the results!!!!")
        self._check_gff(args_srna.gffs)
        self._check_gff(args_srna.trans)
        if args_srna.tss_folder is not None:
            self._check_gff(args_srna.tss_folder)
            self.multiparser.parser_gff(args_srna.tss_folder, "TSS")
            self.multiparser.combine_gff(args_srna.gffs, self.tss_path,
                                         None, "TSS")
        if args_srna.pro_folder is not None:
            self._check_gff(args_srna.pro_folder)
            self.multiparser.parser_gff(args_srna.pro_folder, "processing")
            self.multiparser.combine_gff(args_srna.gffs, self.pro_path,
                                         None, "processing")
        if args_srna.sorf_file is not None:
            self._check_gff(args_srna.sorf_file)
            self.multiparser.parser_gff(args_srna.sorf_file, "sORF")
            self.multiparser.combine_gff(args_srna.gffs, self.sorf_path,
                                         None, "sORF")
        # Fasta files are only needed for UTR detection and the
        # structure/BLAST filters.
        if args_srna.utr_srna or ("sec_str" in args_srna.import_info) or (
                "blast_nr" in args_srna.import_info) or (
                "blast_srna" in args_srna.import_info):
            if args_srna.fastas is None:
                print("Error: lack required fasta files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            self.multiparser.parser_fasta(args_srna.fastas)
            self.multiparser.combine_fasta(args_srna.gffs,
                                           self.fasta_path, None)
        if args_srna.terms is not None:
            self._check_gff(args_srna.terms)
            self.multiparser.parser_gff(args_srna.terms, "term")
            self.multiparser.combine_gff(args_srna.gffs, self.term_path,
                                         None, "term")
        else:
            self.term_path = None

    def _run_program(self, args_srna):
        """Run intergenic + UTR detection per genome; return prefixes."""
        prefixs = []
        tss = None
        for gff in os.listdir(args_srna.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Running sRNA detection of {0}....".format(prefix))
                tran = self.helper.get_correct_file(
                    self.tran_path, "_transcript.gff", prefix, None, None)
                gffs = {"merge": "_".join([self.prefixs["merge"], prefix]),
                        "utr": "_".join([self.prefixs["utr"], prefix]),
                        "normal": "_".join([self.prefixs["normal"],
                                            prefix])}
                csvs = {"merge": "_".join([
                            self.prefixs["merge_table"], prefix]),
                        "utr": "_".join([self.prefixs["utr_table"],
                                         prefix]),
                        "normal": "_".join([
                            self.prefixs["normal_table"], prefix])}
                tss = self._run_normal(
                    prefix, gff, tran, args_srna.fuzzy_tsss["inter"],
                    args_srna)
                if args_srna.utr_srna:
                    print("Running UTR derived sRNA detection "
                          "of {0}".format(prefix))
                    if tss is None:
                        tss = self.helper.get_correct_file(
                            self.tss_path, "_TSS.gff", prefix, None, None)
                    if self.pro_path is not None:
                        pro = self.helper.get_correct_file(
                            self.pro_path, "_processing.gff",
                            prefix, None, None)
                    else:
                        pro = None
                    if tss is not None:
                        self._run_utrsrna(gff, tran, prefix, tss,
                                          pro, args_srna)
                self._merge_srna(args_srna, gffs, csvs, prefix,
                                 os.path.join(args_srna.gffs, gff), tss)
                filter_frag(csvs["merge"], gffs["merge"])
                self.helper.sort_gff(gffs["merge"],
                                     "_".join([self.prefixs["basic"],
                                               prefix]))
        return prefixs

    def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss):
        print("merging data of intergenic and UTR_derived sRNA...")
        merge_srna_gff(gffs, args_srna.in_cds,
                       args_srna.cutoff_overlap, gff_file)
        merge_srna_table(gffs["merge"], csvs,
                         os.path.join(args_srna.wig_path,
                                      "_".join([prefix, "forward.wig"])),
                         os.path.join(args_srna.wig_path,
                                      "_".join([prefix, "reverse.wig"])),
                         tss, args_srna)

    def _run_RNAfold(self, seq_file, vienna_path, sec_file):
        # Pipe the sequences through RNAfold; "-p" also writes the
        # dot-plot (.dp.ps) files used later.
        os.system(" ".join(["cat", seq_file, "|",
                            os.path.join(vienna_path, "RNAfold"),
                            "-p", ">", sec_file]))

    def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path,
                     dot_path, vienna_path):
        """Extract candidate sequences and fold them with RNAfold.

        Changes the working directory into a tmp folder (RNAfold writes
        its .ps files to cwd); the caller restores it via the returned
        "main" path.
        """
        detect = False
        for fasta in os.listdir(fasta_path):
            if fasta.endswith(".fa") and (
                    fasta.replace(".fa", "") == prefix):
                detect = True
                break
        if detect:
            detect = False
            seq_file = os.path.join(out_folder,
                                    "_".join(["sRNA_seq", prefix]))
            sec_file = os.path.join(out_folder,
                                    "_".join(["sRNA_2d", prefix]))
            self.helper.get_seq("_".join([self.prefixs["basic"], prefix]),
                                os.path.join(fasta_path, fasta), seq_file)
        else:
            print("Error:There is not fasta file of {0}".format(prefix))
            print("please check your imported information")
            sys.exit()
        tmp_path = os.path.join(out_folder, "tmp_srna")
        self.helper.check_make_folder(tmp_path)
        main_path = os.getcwd()
        os.chdir(tmp_path)
        # Re-anchor all paths at the original cwd since we changed dir.
        sec_file = os.path.join(main_path, sec_file)
        seq_file = os.path.join(main_path, seq_file)
        tmp_sec_path = os.path.join(main_path, sec_path)
        tmp_dot_path = os.path.join(main_path, dot_path)
        self._run_RNAfold(seq_file, vienna_path, sec_file)
        extract_energy(os.path.join(main_path,
                       "_".join([self.prefixs["basic"], prefix])),
                       sec_file, os.path.join(main_path,
                       "_".join([self.prefixs["energy"], prefix])))
        # "|" in sequence ids breaks downstream tools; sanitize filenames.
        for ps in os.listdir(os.getcwd()):
            new_ps = ps.replace("|", "_")
            shutil.move(ps, new_ps)
        return {"sec": tmp_sec_path, "dot": tmp_dot_path,
                "main": main_path,
                "tmp": os.path.join(main_path, tmp_path)}

    def _run_replot(self, vienna_util, tmp_paths, file_, dot_file,
                    rel_file):
        os.system(" ".join([os.path.join(vienna_util, "relplot.pl"),
                            os.path.join(tmp_paths["tmp"], file_),
                            os.path.join(tmp_paths["tmp"], dot_file),
                            ">", os.path.join(tmp_paths["tmp"],
                                              rel_file)]))

    def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file):
        call([ps2pdf14_path, os.path.join(tmp_paths["tmp"], file_),
              pdf_file])

    def _replot_sec_to_pdf(self, vienna_util, tmp_paths,
                           ps2pdf14_path, prefix):
        """Re-plot RNAfold structures and convert all plots to PDF."""
        for file_ in os.listdir(os.getcwd()):
            if file_.endswith("ss.ps"):
                dot_file = file_.replace("ss.ps", "dp.ps")
                rel_file = file_.replace("ss.ps", "rss.ps")
                print("replot {0}".format(file_))
                self._run_replot(vienna_util, tmp_paths, file_,
                                 dot_file, rel_file)
        for file_ in os.listdir(tmp_paths["tmp"]):
            if (file_.endswith("rss.ps")) or (file_.endswith("dp.ps")):
                pdf_file = file_.replace(".ps", ".pdf")
                print("convert {0} to pdf".format(file_))
                self._convert_pdf(ps2pdf14_path, tmp_paths,
                                  file_, pdf_file)
        os.mkdir(os.path.join(tmp_paths["sec"], prefix))
        os.mkdir(os.path.join(tmp_paths["dot"], prefix))
        self.helper.move_all_content(
            tmp_paths["tmp"], os.path.join(tmp_paths["sec"], prefix),
            ["rss.pdf"])
        self.helper.move_all_content(
            tmp_paths["tmp"], os.path.join(tmp_paths["dot"], prefix),
            ["dp.pdf"])

    def _run_mountain(self, vienna_util, tmp_paths, dot_file, out):
        call([os.path.join(vienna_util, "mountain.pl"),
              os.path.join(tmp_paths["tmp"], dot_file)], stdout=out)

    def _plot_mountain(self, mountain, moun_path, tmp_paths, prefix,
                       vienna_util):
        """Optionally generate a mountain plot for every dot-plot file."""
        if mountain:
            tmp_moun_path = os.path.join(tmp_paths["main"], moun_path)
            os.mkdir(os.path.join(tmp_moun_path, prefix))
            txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt")
            self.helper.check_make_folder(txt_path)
            print("Generating mountain plot of {0}....".format(prefix))
            for dot_file in os.listdir(tmp_paths["tmp"]):
                if dot_file.endswith("dp.ps"):
                    moun_txt = os.path.join(tmp_paths["tmp"],
                                            "mountain.txt")
                    out = open(moun_txt, "w")
                    moun_file = dot_file.replace("dp.ps", "mountain.pdf")
                    print("Generating {0}".format(moun_file))
                    self._run_mountain(vienna_util, tmp_paths,
                                       dot_file, out)
                    plot_mountain_plot(moun_txt, moun_file)
                    shutil.move(moun_file,
                                os.path.join(tmp_moun_path, prefix,
                                             moun_file))
                    out.close()
                    os.remove(moun_txt)

    def _compute_2d_and_energy(self, args_srna, prefixs):
        """Fold candidates, plot structures, and attach folding energy."""
        print("Running energy calculation....")
        moun_path = os.path.join(args_srna.out_folder, "mountain_plot")
        sec_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "sec_plot")
        dot_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "dot_plot")
        self.helper.remove_all_content(sec_path, None, "dir")
        self.helper.remove_all_content(dot_path, None, "dir")
        self.helper.remove_all_content(moun_path, None, "dir")
        for prefix in prefixs:
            tmp_paths = self._get_seq_sec(
                self.fasta_path, args_srna.out_folder, prefix, sec_path,
                dot_path, args_srna.vienna_path)
            self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths,
                                    args_srna.ps2pdf14_path, prefix)
            self._plot_mountain(args_srna.mountain, moun_path,
                                tmp_paths, prefix, args_srna.vienna_util)
            self.helper.remove_all_content(os.getcwd(), ".ps", "file")
            os.chdir(tmp_paths["main"])
            shutil.move("_".join([self.prefixs["energy"], prefix]),
                        "_".join([self.prefixs["basic"], prefix]))
            shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna"))

    def _run_blast(self, blast_path, program, database, e, seq_file,
                   blast_file, strand):
        call([os.path.join(blast_path, program), "-db", database,
              "-evalue", str(e), "-strand", strand, "-query", seq_file,
              "-out", blast_file])

    def _get_strand_fasta(self, seq_file, out_folder):
        """Split a fasta file by strand ("+"/"-" suffix on the header)."""
        tmp_plus = os.path.join(out_folder, "tmp_plus.fa")
        tmp_minus = os.path.join(out_folder, "tmp_minus.fa")
        strand = ""
        # "with" closes all three handles even on error (the original
        # used manual open/close and could leak them).
        with open(tmp_plus, "w") as out_p, \
                open(tmp_minus, "w") as out_m, \
                open(seq_file) as sh:
            for line in sh:
                line = line.strip()
                if line.startswith(">"):
                    if line[-1] == "+":
                        out_p.write(line + "\n")
                        strand = "plus"
                    elif line[-1] == "-":
                        out_m.write(line + "\n")
                        strand = "minus"
                else:
                    # Sequence lines follow the strand of their header.
                    if strand == "plus":
                        out_p.write(line + "\n")
                    elif strand == "minus":
                        out_m.write(line + "\n")
        return tmp_plus, tmp_minus

    def _blast(self, database, database_format, data_type, args_srna,
               prefixs, program, database_type, e):
        """BLAST candidates against nr or an sRNA database and merge
        the hits back into the candidate files."""
        if (database is None):
            print("Error: No database assigned!")
        else:
            if database_format:
                self._formatdb(database, data_type, args_srna.out_folder,
                               args_srna.blast_path, database_type)
            for prefix in prefixs:
                blast_file = os.path.join(
                    args_srna.out_folder, "blast_result_and_misc",
                    "_".join([database_type, "blast", prefix + ".txt"]))
                srna_file = "_".join([self.prefixs["basic"], prefix])
                out_file = os.path.join(
                    args_srna.out_folder,
                    "_".join(["tmp", database_type, prefix]))
                print("Running Blast of {0}".format(prefix))
                seq_file = os.path.join(
                    args_srna.out_folder, "_".join(["sRNA_seq", prefix]))
                # Fix: seq_file is a full path but os.listdir() yields
                # bare names, so the original test was always True and the
                # sequence file was regenerated on every call. Compare the
                # basename so an existing file is reused.
                if os.path.basename(seq_file) not in os.listdir(
                        args_srna.out_folder):
                    self.helper.get_seq(
                        srna_file,
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        seq_file)
                if database_type == "nr":
                    # nr is a protein database: blastx each strand
                    # separately, then merge the two reports.
                    tmp_plus, tmp_minus = self._get_strand_fasta(
                        seq_file, args_srna.out_folder)
                    tmp_blast = "tmp_blast.txt"
                    self._run_blast(args_srna.blast_path, program,
                                    database, e, tmp_plus, tmp_blast,
                                    "plus")
                    self._run_blast(args_srna.blast_path, program,
                                    database, e, tmp_minus, blast_file,
                                    "minus")
                    self.helper.merge_file(tmp_blast, blast_file)
                    os.remove(tmp_blast)
                    os.remove(tmp_plus)
                    os.remove(tmp_minus)
                else:
                    self._run_blast(args_srna.blast_path, program,
                                    database, e, seq_file, blast_file,
                                    "both")
                extract_blast(blast_file, srna_file, out_file,
                              out_file + ".csv", database_type)
                shutil.move(out_file, srna_file)

    def _class_srna(self, prefixs, args_srna):
        """Split candidates into classes and write per-class outputs."""
        # NOTE(review): this condition is a tautology (a length cannot
        # equal both 1 and 0, so "!= 1 or != 0" is always True); "and"
        # was probably intended. Kept as-is to preserve behavior.
        if (len(args_srna.import_info) != 1) or (
                len(args_srna.import_info) != 0):
            for prefix in prefixs:
                print("classifying sRNA of {0}".format(prefix))
                class_gff = os.path.join(self.gff_output, "for_class")
                class_table = os.path.join(self.table_output, "for_class")
                self.helper.check_make_folder(os.path.join(class_table,
                                                           prefix))
                self.helper.check_make_folder(os.path.join(class_gff,
                                                           prefix))
                class_gff = os.path.join(class_gff, prefix)
                class_table = os.path.join(class_table, prefix)
                self.helper.check_make_folder(class_table)
                self.helper.check_make_folder(class_gff)
                out_stat = os.path.join(
                    self.stat_path, "_".join([
                        "stat_sRNA_class", prefix + ".csv"]))
                classify_srna(os.path.join(self.all_best["all_gff"],
                              "_".join([prefix, "sRNA.gff"])),
                              class_gff, out_stat, args_srna)
                for srna in os.listdir(class_gff):
                    out_table = os.path.join(
                        class_table, srna.replace(".gff", ".csv"))
                    gen_srna_table(
                        os.path.join(class_gff, srna),
                        "_".join([self.prefixs["merge_table"], prefix]),
                        "_".join([self.tmps["nr"], prefix + ".csv"]),
                        "_".join([self.tmps["srna"], prefix + ".csv"]),
                        args_srna, out_table)

    def _get_best_result(self, prefixs, args_srna):
        """Write the "best" gff + table for every genome."""
        for prefix in prefixs:
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            gen_best_srna(os.path.join(self.all_best["all_gff"],
                          "_".join([prefix, "sRNA.gff"])),
                          best_gff, args_srna)
            gen_srna_table(os.path.join(self.all_best["best_gff"],
                           "_".join([prefix, "sRNA.gff"])),
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, best_table)

    def _remove_file(self, args_srna):
        """Delete all temporary files and folders of the run."""
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir")
        self.helper.remove_all_content(args_srna.out_folder, "tmp_",
                                       "file")
        self.helper.remove_tmp(args_srna.fastas)
        self.helper.remove_tmp(args_srna.gffs)
        if args_srna.frag_wigs is not None:
            self.helper.remove_tmp(args_srna.frag_wigs)
        if args_srna.tex_wigs is not None:
            self.helper.remove_tmp(args_srna.tex_wigs)
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            shutil.rmtree(args_srna.merge_wigs)
        self.helper.remove_tmp(args_srna.trans)
        if args_srna.tss_folder is not None:
            self.helper.remove_tmp(args_srna.tss_folder)
        if args_srna.pro_folder is not None:
            self.helper.remove_tmp(args_srna.pro_folder)
        if args_srna.sorf_file is not None:
            self.helper.remove_tmp(args_srna.sorf_file)
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        if self.term_path is not None:
            self.helper.remove_tmp(args_srna.terms)

    def _filter_srna(self, args_srna, prefixs):
        """Apply the filters requested in import_info to the candidates."""
        if "sec_str" in args_srna.import_info:
            self._compute_2d_and_energy(args_srna, prefixs)
        if "blast_nr" in args_srna.import_info:
            self._blast(args_srna.nr_database, args_srna.nr_format,
                        "prot", args_srna, prefixs, "blastx", "nr",
                        args_srna.e_nr)
        if "blast_srna" in args_srna.import_info:
            self._blast(args_srna.srna_database, args_srna.srna_format,
                        "nucl", args_srna, prefixs, "blastn", "sRNA",
                        args_srna.e_srna)
        if "sorf" in args_srna.import_info:
            for prefix in prefixs:
                if ("_".join([prefix, "sORF.gff"]) in
                        os.listdir(self.sorf_path)):
                    tmp_srna = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_srna_sorf", prefix]))
                    tmp_sorf = os.path.join(
                        args_srna.out_folder,
                        "".join(["tmp_sorf_srna", prefix]))
                    srna_sorf_comparison(
                        "_".join([self.prefixs["basic"], prefix]),
                        os.path.join(self.sorf_path,
                                     "_".join([prefix, "sORF.gff"])),
                        tmp_srna, tmp_sorf)
                    os.remove(tmp_sorf)
                    shutil.move(tmp_srna,
                                "_".join([self.prefixs["basic"],
                                          prefix]))

    def _import_info_format(self, import_info):
        """Normalize every requested feature name to lower case."""
        return [info.lower() for info in import_info]

    def _gen_table(self, prefixs, args_srna):
        for prefix in prefixs:
            out_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            gen_srna_table(os.path.join(self.all_best["all_gff"],
                           "_".join([prefix, "sRNA.gff"])),
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, out_table)

    def _print_rank_all(self, prefixs):
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            print_rank_all(all_table, best_table)

    def _filter_min_utr(self, prefixs, min_utr):
        for prefix in prefixs:
            filter_utr(os.path.join(self.all_best["all_gff"],
                                    "_".join([prefix, "sRNA.gff"])),
                       os.path.join(self.all_best["all_table"],
                                    "_".join([prefix, "sRNA.csv"])),
                       min_utr)

    def _antisense(self, gffs, prefixs):
        # Tag antisense candidates in both the "all" and "best" outputs.
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            srna_antisense(all_gff, all_table,
                           os.path.join(gffs, prefix + ".gff"))
            srna_antisense(best_gff, best_table,
                           os.path.join(gffs, prefix + ".gff"))

    def _blast_stat(self, stat_path, srna_tables):
        for srna_table in os.listdir(os.path.join(srna_tables, "best")):
            out_srna_blast = os.path.join(
                stat_path,
                "stat_" + srna_table.replace(".csv", "_blast.csv"))
            blast_class(os.path.join(srna_tables, "best", srna_table),
                        out_srna_blast)

    def _compare_term_promoter(self, out_table, prefix, args_srna):
        """Cross-reference candidates with terminators and promoters."""
        if ("term" in args_srna.import_info) and (
                self.term_path is not None):
            compare_srna_term(os.path.join(self.all_best["all_gff"],
                              "_".join([prefix, "sRNA.gff"])),
                              out_table, os.path.join(self.term_path,
                              "_".join([prefix, "term.gff"])),
                              args_srna.fuzzy_b, args_srna.fuzzy_a)
        if ("promoter" in args_srna.import_info) and (
                args_srna.promoter_table is not None) and (
                "tss" in args_srna.import_info):
            compare_srna_promoter(os.path.join(self.all_best["all_gff"],
                                  "_".join([prefix, "sRNA.gff"])),
                                  out_table, args_srna)

    def run_srna_detection(self, args_srna):
        """Main entry point for the whole sRNA detection pipeline."""
        self._check_necessary_file(args_srna)
        self.multiparser.parser_gff(args_srna.trans, "transcript")
        self.multiparser.combine_gff(args_srna.gffs, self.tran_path,
                                     None, "transcript")
        args_srna.import_info = self._import_info_format(
            args_srna.import_info)
        prefixs = self._run_program(args_srna)
        self._filter_srna(args_srna, prefixs)
        for prefix in prefixs:
            shutil.copyfile("_".join([self.prefixs["basic"], prefix]),
                            os.path.join(self.all_best["all_gff"],
                                         "_".join([prefix, "sRNA.gff"])))
            self._compare_term_promoter(
                "_".join([self.prefixs["merge_table"], prefix]),
                prefix, args_srna)
        self._gen_table(prefixs, args_srna)
        self._class_srna(prefixs, args_srna)
        self._get_best_result(prefixs, args_srna)
        self._print_rank_all(prefixs)
        if "blast_srna" in args_srna.import_info:
            self._blast_stat(self.stat_path, self.table_output)
        self._remove_file(args_srna)
class Terminator(object):
    """Detection of transcription terminators.

    Combines TransTermHP predictions with coverage-based detection
    (RNAfold secondary structure + poly-T signal + wiggle coverage),
    then merges, classifies (all/express/best/non_express) and
    computes statistics for the resulting terminator candidates.
    """

    def __init__(self, args_term):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.gff_parser = Gff3Parser()
        # "tmp" subfolders are created by the multiparser for each input set
        self.gff_path = os.path.join(args_term.gffs, "tmp")
        self.fasta_path = os.path.join(args_term.fastas, "tmp")
        self.tran_path = os.path.join(args_term.trans, "tmp")
        # output roots: gff files under "gffs", csv tables under "tables"
        self.outfolder = {"term": os.path.join(args_term.out_folder, "gffs"),
                          "csv": os.path.join(args_term.out_folder, "tables")}
        # per-classification gff output folders
        self.terms = {"all": os.path.join(self.outfolder["term"],
                                          "all_candidates"),
                      "express": os.path.join(self.outfolder["term"],
                                              "express"),
                      "best": os.path.join(self.outfolder["term"], "best"),
                      "non": os.path.join(self.outfolder["term"],
                                          "non_express")}
        # per-classification csv output folders (mirror self.terms)
        self.csvs = {"all": os.path.join(self.outfolder["csv"],
                                         "all_candidates"),
                     "express": os.path.join(self.outfolder["csv"],
                                             "express"),
                     "best": os.path.join(self.outfolder["csv"], "best"),
                     "non": os.path.join(self.outfolder["csv"],
                                         "non_express")}
        self.combine_path = os.path.join(self.gff_path, "combine")
        # NOTE: several tmp paths are rooted at os.getcwd() — the working
        # directory at construction time matters for cleanup later.
        self.tmps = {"transterm": os.path.join(os.getcwd(), "tmp_transterm"),
                     "hp": "transtermhp", "hp_gff": "transtermhp.gff",
                     "hp_path": "tmp_transterm/tmp",
                     "term_table": os.path.join(os.getcwd(),
                                                "tmp_term_table"),
                     "merge": os.path.join(os.getcwd(), "tmp_merge_gff"),
                     "gff": "tmp.gff",
                     "folder": os.path.join(os.getcwd(), "tmp")}
        self.suffixs = {"gff": "term.gff", "csv": "term.csv",
                        "allgff": "term_all.gff"}
        if args_term.srnas:
            self.srna_path = os.path.join(args_term.srnas, "tmp")
        else:
            self.srna_path = None
        self._make_gff_folder()

    def _combine_annotation(self, combine_file, files):
        """Concatenate the data sections of ptt/rnt files into combine_file.

        For each input file, lines are copied only after a line containing
        "Location" (i.e. the column-header line) has been seen, so the
        per-file headers are skipped.
        """
        with open(combine_file, 'w') as result:
            for file_ in files:
                check_start = False
                fh = open(file_, 'r')
                for line in fh:
                    if check_start:
                        result.write(line)
                    if "Location" in line:
                        check_start = True
                # ensure the copied block ends with a newline; `line` here is
                # the last line read from the current file
                if "\n" not in line:
                    result.write("\n")
                fh.close()

    def _make_gff_folder(self):
        """Create the gff and csv output folders for every classification."""
        self.helper.check_make_folder(self.terms["all"])
        self.helper.check_make_folder(self.csvs["all"])
        self.helper.check_make_folder(self.terms["best"])
        self.helper.check_make_folder(self.csvs["best"])
        self.helper.check_make_folder(self.terms["express"])
        self.helper.check_make_folder(self.csvs["express"])
        self.helper.check_make_folder(self.terms["non"])
        self.helper.check_make_folder(self.csvs["non"])

    def _convert_gff2rntptt(self, gff_path, fasta_path, sRNAs):
        """Convert each .gff in gff_path to .rnt/.ptt (TransTermHP inputs).

        Returns (file_types, prefixs): file_types maps strain prefix to
        "srna" (sRNA annotation merged in) or "normal"; prefixs lists the
        strain prefixes found. Exits if a matching fasta is missing.
        """
        file_types = {}
        prefixs = []
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                prefixs.append(prefix)
                gff_file = os.path.join(gff_path, gff)
                rnt_file = os.path.join(gff_path,
                                        gff.replace(".gff", ".rnt"))
                ptt_file = os.path.join(gff_path,
                                        gff.replace(".gff", ".ptt"))
                fasta = self.helper.get_correct_file(
                        fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    print("Error: no proper file - {0}.fa".format(prefix))
                    sys.exit()
                if sRNAs:
                    self.multiparser.parser_gff(sRNAs, "sRNA")
                    srna = self.helper.get_correct_file(
                            self.srna_path, "_sRNA.gff", prefix, None, None)
                    if (srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, srna,
                            srna.replace(".gff", ".rnt"))
                        file_types[prefix] = "srna"
                    if (not srna) and (fasta):
                        self.converter.convert_gff2rntptt(
                            gff_file, fasta, ptt_file, rnt_file, None, None)
                        file_types[prefix] = "normal"
                else:
                    self.converter.convert_gff2rntptt(
                        gff_file, fasta, ptt_file, rnt_file, None, None)
                    file_types[prefix] = "normal"
        return file_types, prefixs

    def _combine_ptt_rnt(self, gff_path, file_types, srna_path):
        """Merge each strain's .ptt/.rnt (and sRNA .rnt when available)
        into one combined .ptt under self.combine_path."""
        self.helper.check_make_folder(self.combine_path)
        for prefix, file_type in file_types.items():
            combine_file = os.path.join(self.combine_path, prefix + '.ptt')
            if file_type == "normal":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt")]
                self._combine_annotation(combine_file, files)
            elif file_type == "srna":
                files = [os.path.join(gff_path, prefix + ".ptt"),
                         os.path.join(gff_path, prefix + ".rnt"),
                         os.path.join(srna_path,
                                      "_".join([prefix, "sRNA.rnt"]))]
                self._combine_annotation(combine_file, files)

    def _TransTermHP(self, fasta, file_, out_path, prefix, out, args_term):
        """Invoke the external TransTermHP binary for one strain.

        stdout is redirected to the already-open file object `out`;
        the .t2t and .bag result files are written into out_path.
        """
        call([args_term.TransTermHP_path, "-p", args_term.expterm_path,
              fasta, os.path.join(self.combine_path, file_), "--t2t-perf",
              os.path.join(out_path, "_".join([
                  prefix,
                  "terminators_within_robust_tail-to-tail_regions.t2t"])),
              "--bag-output", os.path.join(out_path, "_".join([
                  prefix, "best_terminator_after_gene.bag"]))],
             stdout=out)

    def _run_TransTermHP(self, args_term):
        """Run TransTermHP for every combined .ptt, then delete the
        combine folder. Exits if a strain's fasta cannot be found."""
        self.helper.check_make_folder(self.tmps["transterm"])
        for file_ in os.listdir(self.combine_path):
            if ".ptt" in file_:
                prefix = file_.replace(".ptt", "")
                fasta = self.helper.get_correct_file(
                        self.fasta_path, ".fa", prefix, None, None)
                if not fasta:
                    print("Error: no proper file - {0}.fa".format(prefix))
                    sys.exit()
                out_path = os.path.join(args_term.hp_folder, prefix)
                self.helper.check_make_folder(out_path)
                out = open(os.path.join(
                    out_path, "_".join([prefix, "terminators.txt"])), "w")
                self._TransTermHP(fasta, file_, out_path, prefix, out,
                                  args_term)
                out.close()
        shutil.rmtree(self.combine_path)

    def _convert_to_gff(self, prefixs, args_term):
        """Convert every TransTermHP .bag result to gff and combine the
        per-strain gffs (via multiparser) under tmp_transterm."""
        for prefix in prefixs:
            for folder in os.listdir(args_term.hp_folder):
                if prefix == folder:
                    out_path = os.path.join(args_term.hp_folder, folder)
                    for file_ in os.listdir(out_path):
                        if file_.endswith(".bag"):
                            out_file = os.path.join(
                                self.tmps["transterm"],
                                "_".join([prefix, self.tmps["hp_gff"]]))
                            self.converter.convert_transtermhp2gff(
                                os.path.join(out_path, file_), out_file)
        self.multiparser.combine_gff(args_term.gffs,
                                     self.tmps["transterm"],
                                     None, self.tmps["hp"])

    def _combine_wigs(self, args_term):
        """Return a folder holding all wig files.

        If both TEX and fragmented wigs are given, copy them side by side
        into a new sibling "merge_wigs" folder; otherwise return whichever
        folder is present; exit if neither is.
        """
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            # place merge_wigs next to the tex_wigs folder
            folder = args_term.tex_wigs.split("/")
            folder = "/".join(folder[:-1])
            merge_wigs = os.path.join(folder, "merge_wigs")
            self.helper.check_make_folder(merge_wigs)
            for wig in os.listdir(args_term.tex_wigs):
                if os.path.isdir(os.path.join(args_term.tex_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.tex_wigs, wig),
                                merge_wigs)
            for wig in os.listdir(args_term.frag_wigs):
                if os.path.isdir(os.path.join(args_term.frag_wigs, wig)):
                    pass
                else:
                    shutil.copy(os.path.join(args_term.frag_wigs, wig),
                                merge_wigs)
        elif (args_term.tex_wigs is not None):
            merge_wigs = args_term.tex_wigs
        elif (args_term.frag_wigs is not None):
            merge_wigs = args_term.frag_wigs
        else:
            print("Error: no proper wig files!!!")
            sys.exit()
        return merge_wigs

    def _merge_sRNA(self, sRNAs, prefixs, gff_path):
        """Merge sRNA annotations into each strain's gff (sorted result in
        tmp_merge_gff) and return the folder of merged gffs; if no sRNAs
        were supplied, return gff_path unchanged."""
        if sRNAs is not None:
            self.multiparser.parser_gff(sRNAs, "sRNA")
            self.helper.check_make_folder(self.tmps["merge"])
            for prefix in prefixs:
                tmp_gff = os.path.join(self.tmps["merge"],
                                       self.tmps["gff"])
                if self.tmps["gff"] in os.listdir(self.tmps["merge"]):
                    os.remove(tmp_gff)
                self.helper.merge_file(
                    os.path.join(gff_path, prefix + ".gff"), tmp_gff)
                self.helper.merge_file(os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"])),
                    tmp_gff)
                self.helper.sort_gff(tmp_gff, os.path.join(
                    self.tmps["merge"], prefix + ".gff"))
                os.remove(tmp_gff)
            merge_path = self.tmps["merge"]
        else:
            merge_path = gff_path
        return merge_path

    def _move_file(self, term_outfolder, csv_outfolder):
        """Sort each *_term.gff, move it into the all-candidates folders as
        *_term_all.gff, and build the matching raw csv table per strain.

        NOTE(review): the csv_outfolder parameter is not used in this body;
        csv paths come from self.csvs — confirm whether that is intended.
        """
        for gff in os.listdir(term_outfolder):
            if gff.endswith("_term.gff"):
                # sort in place via a temporary file
                self.helper.sort_gff(os.path.join(term_outfolder, gff),
                                     self.tmps["gff"])
                shutil.move(self.tmps["gff"],
                            os.path.join(term_outfolder, gff))
                prefix = gff.replace("_term.gff", "")
                new_gff = os.path.join(self.terms["all"], "_".join([
                    prefix, self.suffixs["allgff"]]))
                csv_file = os.path.join(
                    os.path.join(self.csvs["all"], "_".join([
                        prefix, self.suffixs["csv"]])))
                out = open(new_gff, "w")
                out.write("##gff-version 3\n")
                out.close()
                self.helper.merge_file(
                    os.path.join(term_outfolder, gff),
                    os.path.join(
                        self.terms["all"], "_".join([
                            prefix, self.suffixs["allgff"]])))
                os.remove(os.path.join(term_outfolder, gff))
                pre_strain = ""
                # start the csv fresh if a previous run left one behind
                if ("_".join([prefix, self.suffixs["csv"]]) in
                        os.listdir(self.csvs["all"])):
                    os.remove(csv_file)
                out_csv = open(csv_file, "w")
                out_csv.write("\t".join(["strain", "name", "start", "end",
                              "strand", "detect", "coverage_detail"]) +
                              "\n")
                out_csv.close()
                fh = open(new_gff)
                # append each strain's raw term table once, keyed on seq_id
                # transitions in the sorted gff
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        self.helper.merge_file(os.path.join(
                            self.tmps["term_table"], "_".join([
                                entry.seq_id, "term_raw.csv"])),
                            os.path.join(self.csvs["all"], "_".join([
                                prefix, self.suffixs["csv"]])))
                        pre_strain = entry.seq_id
                fh.close()

    def _run_rnafold(self, RNAfold_path, tmp_seq, tmp_sec, prefix):
        """Run RNAfold on tmp_seq, writing structures to tmp_sec.

        NOTE(review): uses os.system with a shell string and relative ".."
        paths from a scratch cwd; typo "secondray" is in a runtime message
        and is left unchanged here.
        """
        print("Computing secondray structure of {0}".format(prefix))
        self.helper.check_make_folder(self.tmps["folder"])
        pre_cwd = os.getcwd()
        os.chdir(self.tmps["folder"])
        os.system(" ".join([RNAfold_path,
                  "<", os.path.join("..", tmp_seq),
                  ">", os.path.join("..", tmp_sec)]))
        os.chdir(pre_cwd)
        shutil.rmtree(self.tmps["folder"])

    def _compute_intersection_forward_reverse(
            self, prefixs, merge_path, wig_path, merge_wigs, args_term):
        """Coverage-based terminator detection per strain: extract
        intergenic seqs, fold them, find poly-T candidates, then score
        against forward/reverse wiggle coverage; finally combine the gffs
        and populate the output folders via _move_file."""
        for prefix in prefixs:
            tmp_seq = os.path.join(args_term.out_folder,
                                   "_".join(["inter_seq", prefix]))
            tmp_sec = os.path.join(args_term.out_folder,
                                   "_".join(["inter_sec", prefix]))
            tran_file = os.path.join(self.tran_path,
                                     "_".join([prefix, "transcript.gff"]))
            gff_file = os.path.join(merge_path, prefix + ".gff")
            print("Extracting seq of {0}".format(prefix))
            intergenic_seq(os.path.join(self.fasta_path, prefix + ".fa"),
                           tran_file, gff_file, tmp_seq)
            self._run_rnafold(args_term.RNAfold_path, tmp_seq, tmp_sec,
                              prefix)
            tmp_cand = os.path.join(args_term.out_folder,
                                    "_".join(["term_candidates", prefix]))
            poly_t(tmp_seq, tmp_sec, gff_file, tran_file, tmp_cand,
                   args_term)
            print("detection of terminator")
            detect_coverage(
                tmp_cand, os.path.join(merge_path, prefix + ".gff"),
                os.path.join(self.tran_path, "_".join([
                    prefix, "transcript.gff"])),
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(wig_path, "_".join([prefix, "forward.wig"])),
                os.path.join(wig_path, "_".join([prefix, "reverse.wig"])),
                os.path.join(self.tmps["hp_path"], "_".join([
                    prefix, self.tmps["hp_gff"]])), merge_wigs,
                os.path.join(self.outfolder["term"], "_".join([
                    prefix, self.suffixs["gff"]])),
                os.path.join(self.tmps["term_table"], "_".join([
                    prefix, "term_raw.csv"])), args_term)
        self.multiparser.combine_gff(args_term.gffs,
                                     self.outfolder["term"],
                                     None, "term")
        self._move_file(self.outfolder["term"], self.outfolder["csv"])

    def _remove_tmp_file(self, merge_wigs, args_term):
        """Delete all temporary folders/files created by the run."""
        self.helper.remove_tmp(args_term.gffs)
        self.helper.remove_tmp(args_term.fastas)
        if args_term.srnas is not None:
            self.helper.remove_tmp(args_term.srnas)
            shutil.rmtree(self.tmps["merge"])
        # merge_wigs was freshly created only when both wig sets exist
        if (args_term.tex_wigs is not None) and (
                args_term.frag_wigs is not None):
            shutil.rmtree(merge_wigs)
        self.helper.remove_tmp(args_term.trans)
        self.helper.remove_tmp(args_term.tex_wigs)
        self.helper.remove_tmp(args_term.frag_wigs)
        self.helper.remove_tmp(self.outfolder["term"])
        shutil.rmtree(self.tmps["transterm"])
        shutil.rmtree(self.tmps["term_table"])
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_seq_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "inter_sec_", "file")
        self.helper.remove_all_content(args_term.out_folder,
                                       "term_candidates_", "file")

    def _compute_stat(self, args_term):
        """Renumber terminator entries (ID=termN, Name=Terminator_00000
        style) into per-strain *_term.gff files, then, when requested,
        compute statistics and split results into best/express/non
        folders, moving the csvs alongside."""
        new_prefixs = []
        for gff in os.listdir(self.terms["all"]):
            if gff.endswith("_term_all.gff"):
                out_tmp = open(self.tmps["gff"], "w")
                out_tmp.write("##gff-version 3\n")
                new_prefix = gff.replace("_term_all.gff", "")
                new_prefixs.append(gff.replace("_term_all.gff", ""))
                num = 0
                fh = open(os.path.join(self.terms["all"], gff))
                for entry in self.gff_parser.entries(fh):
                    # zero-padded 5-digit counter for the display name
                    name = '%0*d' % (5, num)
                    entry.attributes["ID"] = "term" + str(num)
                    entry.attributes["Name"] = "_".join([
                        "Terminator_" + name])
                    entry.attribute_string = ";".join([
                        "=".join(items)
                        for items in entry.attributes.items()])
                    out_tmp.write("\t".join([
                        entry.info_without_attributes,
                        entry.attribute_string]) + "\n")
                    num += 1
                out_tmp.close()
                fh.close()
                shutil.move(self.tmps["gff"],
                            os.path.join(self.terms["all"],
                            "_".join([new_prefix, self.suffixs["gff"]])))
        if args_term.stat:
            stat_path = os.path.join(args_term.out_folder, "statistics")
            for prefix in new_prefixs:
                stat_term(os.path.join(self.terms["all"],
                          "_".join([prefix, self.suffixs["gff"]])),
                          os.path.join(self.csvs["all"],
                          "_".join([prefix, self.suffixs["csv"]])),
                          os.path.join(stat_path,
                          "_".join(["stat", prefix + ".csv"])),
                          os.path.join(self.terms["best"],
                                       "_".join([prefix, "term"])),
                          os.path.join(self.terms["express"],
                                       "_".join([prefix, "term"])),
                          os.path.join(self.terms["non"],
                                       "_".join([prefix, "term"])))
                # stat_term writes csvs next to the gffs; relocate them
                # into the csv tree
                shutil.move(os.path.join(self.terms["best"],
                            "_".join([prefix, self.suffixs["csv"]])),
                            os.path.join(self.csvs["best"],
                            "_".join([prefix, self.suffixs["csv"]])))
                shutil.move(os.path.join(self.terms["express"],
                            "_".join([prefix, self.suffixs["csv"]])),
                            os.path.join(self.csvs["express"],
                            "_".join([prefix, self.suffixs["csv"]])))
                shutil.move(os.path.join(self.terms["non"],
                            "_".join([prefix, self.suffixs["csv"]])),
                            os.path.join(self.csvs["non"],
                            "_".join([prefix, self.suffixs["csv"]])))
                os.remove(os.path.join(self.terms["all"],
                          "_".join([prefix, self.suffixs["allgff"]])))

    def _check_gff_file(self, folder):
        """Validate attribute uniqueness of every .gff in folder."""
        for file_ in os.listdir(folder):
            if file_.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(folder, file_))

    def _compare_term_tran(self, args_term):
        """Compare terminators of each classification against transcripts
        and rename the generic statistics file per classification."""
        self.multiparser.combine_gff(args_term.gffs, self.tran_path,
                                     None, "transcript")
        for type_ in ("best", "express", "all_candidates"):
            compare_term_tran(self.tran_path,
                              os.path.join(self.outfolder["term"], type_),
                              args_term.fuzzy_up_ta,
                              args_term.fuzzy_down_ta,
                              args_term.out_folder, "terminator")
            shutil.move(
                os.path.join(
                    args_term.out_folder, "statistics",
                    "stat_comparison_terminator_transcript.csv"),
                os.path.join(
                    args_term.out_folder, "statistics",
                    "stat_comparison_terminator_transcript_" +
                    type_ + ".csv"))

    def run_terminator(self, args_term):
        """Entry point: run the full terminator detection pipeline.

        Steps, in order: input validation, fasta parsing, rnt/ptt
        conversion, TransTermHP, gff conversion, sRNA merging,
        coverage-based detection, statistics, transcript comparison,
        and cleanup.
        """
        self._check_gff_file(args_term.gffs)
        self._check_gff_file(args_term.trans)
        self.multiparser.parser_fasta(args_term.fastas)
        if (not args_term.gffs) or (not args_term.fastas):
            print("Error: please assign gff annotation folder "
                  "and fasta folder!!!")
            sys.exit()
        file_types, prefixs = self._convert_gff2rntptt(
            self.gff_path, self.fasta_path, args_term.srnas)
        self._combine_ptt_rnt(self.gff_path, file_types, self.srna_path)
        self._run_TransTermHP(args_term)
        self._convert_to_gff(prefixs, args_term)
        self.helper.remove_tmp(self.gff_path)
        self.multiparser.parser_gff(args_term.trans, "transcript")
        self.helper.check_make_folder(self.tmps["term_table"])
        self.multiparser.parser_gff(self.tmps["transterm"],
                                    self.tmps["hp"])
        merge_path = self._merge_sRNA(args_term.srnas, prefixs,
                                      self.gff_path)
        self._compute_intersection_forward_reverse(
            prefixs, merge_path, args_term.wig_path,
            args_term.merge_wigs, args_term)
        self._compute_stat(args_term)
        self._compare_term_tran(args_term)
        self._remove_tmp_file(args_term.merge_wigs, args_term)
class Ribos(object):
    '''detection of riboswitch and RNA thermometer'''

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        # "tmp" subfolders are created by the multiparser for each input set
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        if args_ribo.tsss is not None:
            self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        else:
            self.tss_path = None
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        # riboswitch and thermometer runs each get an independent set of
        # output folders / Rfam model / filename suffixes
        if (args_ribo.program == "both") or (
                args_ribo.program == "riboswitch"):
            (self.ribos_stat_folder, self.ribos_gff_outfolder,
             self.ribos_table_folder, self.ribos_scan_folder,
             self.ribos_tmp_files, self.ribos_rfam,
             self.ribos_suffixs) = self._create_out_folders(
                args_ribo.ribos_out_folder, "riboswitch",
                args_ribo.database)
        if (args_ribo.program == "both") or (
                args_ribo.program == "thermometer"):
            (self.thermo_stat_folder, self.thermo_gff_outfolder,
             self.thermo_table_folder, self.thermo_scan_folder,
             self.thermo_tmp_files, self.thermo_rfam,
             self.thermo_suffixs) = self._create_out_folders(
                args_ribo.thermo_out_folder, "RNA_thermometer",
                args_ribo.database)

    def _create_out_folders(self, out_folder, feature, database):
        """Build the per-feature output layout.

        Returns (stat_folder, gff_outfolder, table_folder, scan_folder,
        tmp_files dict, Rfam .cm path, filename-suffix dict) — paths only;
        nothing is created on disk here.
        """
        stat_folder = os.path.join(out_folder, "statistics")
        gff_outfolder = os.path.join(out_folder, "gffs")
        table_folder = os.path.join(out_folder, "tables")
        scan_folder = os.path.join(out_folder, "scan_Rfam_results")
        tmp_files = {"fasta": os.path.join(
                         out_folder, "tmp_fasta"),
                     "scan": os.path.join(
                         out_folder, "tmp_scan"),
                     "table": os.path.join(
                         out_folder, "tmp_table")}
        rfam = os.path.join(database, "Rfam_" + feature + ".cm")
        suffixs = {"csv": feature + ".csv",
                   "txt": feature + "_prescan.txt",
                   "re_txt": feature + "_scan.txt",
                   "re_csv": feature + "_scan.csv"}
        return (stat_folder, gff_outfolder, table_folder, scan_folder,
                tmp_files, rfam, suffixs)

    def _run_cmscan(self, args_ribo, seq, type_, prefix, tmp_files,
                    suffixs, rfam, log):
        """Run Infernal cmscan on seq against rfam; return the scan file.

        args_ribo.cutoff selects the inclusion threshold flavour:
        "e_<val>" -> --incE (E-value), "s_<val>" -> --incT (bit score);
        anything else aborts the program. stdout goes to the scan file.
        """
        scan_file = os.path.join(tmp_files["scan"],
                                 "_".join([prefix, suffixs[type_]]))
        scan = open(scan_file, "w")
        if args_ribo.cutoff.split("_")[0] == "e":
            value = args_ribo.cutoff.split("_")[-1]
            log.write(" ".join([args_ribo.cmscan_path, "--incE",
                      value, "--acc", rfam, seq]) + "\n")
            call([args_ribo.cmscan_path, "--incE",
                  value, "--acc", rfam, seq], stdout=scan)
        elif args_ribo.cutoff.split("_")[0] == "s":
            value = args_ribo.cutoff.split("_")[-1]
            log.write(" ".join([args_ribo.cmscan_path, "--incT",
                      value, "--acc", rfam, seq]) + "\n")
            call([args_ribo.cmscan_path, "--incT",
                  value, "--acc", rfam, seq], stdout=scan)
        else:
            print("Error: the --cutoff needs to start from 'e' "
                  "(e value) or 's' (score)!")
            log.write("the --cutoff needs to start from 'e' "
                      "(e value) or 's' (score).\n")
            sys.exit()
        scan.close()
        log.write("Done!\n")
        log.write("\t" + scan_file + " is temporary generated.\n")
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo, tmp_files, suffixs,
                           feature, rfam, log):
        '''extract the seq of candidates and scanning the candidates'''
        # two-pass scheme: pre-scan full candidate sequences, regenerate a
        # refined sequence set from the hits, then scan again and merge the
        # second pass's table into the first.
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(tmp_files["fasta"],
                                         prefix + ".fa")
                prefixs.append(prefix)
                print("Extracting sequences of candidates for {0}".format(
                    prefix))
                if self.tss_path is not None:
                    tss_file = os.path.join(self.tss_path,
                                            prefix + "_TSS.gff")
                else:
                    tss_file = None
                log.write("Running extract_RBS.py to extract potential "
                          "sequences of riboswitches/RNA thermometers for "
                          "{0}.\n".format(prefix))
                extract_potential_rbs(
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.gff_path, gff), tss_file,
                    os.path.join(self.tran_path,
                                 prefix + "_transcript.gff"),
                    first_seq, args_ribo, feature)
                log.write("\t" + first_seq +
                          " is temporary generated.\n")
                print("Pre-scanning of {0}".format(prefix))
                log.write("Using Infernal to pre-scan riboswitches/RNA "
                          "thermometers for {0}.\n".format(prefix))
                log.write("Please make sure the version of Infernal is at least 1.1.1.\n")
                first_scan_file = self._run_cmscan(
                    args_ribo, first_seq, "txt", prefix, tmp_files,
                    suffixs, rfam, log)
                sec_seq = os.path.join(
                    tmp_files["fasta"],
                    "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                    tmp_files["table"],
                    "_".join([prefix, suffixs["csv"]]))
                log.write("Running recompute_RBS.py to update the potential "
                          "sequences of riboswitches/RNA thermometers for {0} "
                          "based on the pre-scanning results.\n".format(prefix))
                regenerate_seq(first_scan_file, first_seq,
                               first_table, sec_seq)
                log.write("\t" + sec_seq +
                          " is temporary generated.\n")
                print("Scanning of {0}".format(prefix))
                log.write("Using Infernal to scan riboswitches/RNA "
                          "thermometers for {0}.\n".format(prefix))
                log.write("Please make sure the version of Infernal is at "
                          "least 1.1.1.\n")
                sec_scan_file = self._run_cmscan(
                    args_ribo, sec_seq, "re_txt", prefix, tmp_files,
                    suffixs, rfam, log)
                sec_table = os.path.join(
                    tmp_files["table"],
                    "_".join([prefix, suffixs["re_csv"]]))
                log.write("Running recompute_RBS.py and modify_rbs_table.py "
                          "to generate tables for {0} "
                          "based on the scanning results.\n".format(prefix))
                reextract_rbs(sec_scan_file, first_table, sec_table,
                              args_ribo.cutoff)
                # the refined table replaces the pre-scan table
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo, scan_folder, suffixs, tmp_files,
                       table_folder, stat_folder, feature_id,
                       gff_outfolder, feature, log):
        '''merge the results from the results of two searching'''
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merging results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                    scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                log.write("Merging the results from Infernal to generate "
                          "tables for {0}.\n".format(prefix))
                # the first strain in the gff seeds the table (copy);
                # every subsequent strain is appended (merge)
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        if len(pre_strain) == 0:
                            shutil.copyfile(os.path.join(
                                tmp_files["table"],
                                "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        else:
                            self.helper.merge_file(os.path.join(
                                tmp_files["table"],
                                "_".join([entry.seq_id, suffixs["csv"]])),
                                os.path.join(
                                    table_folder,
                                    "_".join([prefix, suffixs["csv"]])))
                        shutil.copy(os.path.join(
                            tmp_files["scan"],
                            "_".join([entry.seq_id, suffixs["txt"]])),
                            os.path.join(scan_folder, prefix))
                        shutil.copy(os.path.join(
                            tmp_files["scan"],
                            "_".join([entry.seq_id, suffixs["re_txt"]])),
                            os.path.join(scan_folder, prefix))
                        pre_strain = entry.seq_id
                log.write("The following files are generated.\n")
                for folder in (table_folder, scan_folder):
                    for file_ in os.listdir(folder):
                        log.write("\t" + os.path.join(folder, file_) +
                                  "\n")
                out_stat = os.path.join(
                    stat_folder,
                    "_".join(["stat", prefix, feature + ".txt"]))
                print("Computing statistics of {0}".format(prefix))
                log.write("Running ribo_gff.py to do statistics and generate "
                          "gff files for {0}.\n".format(prefix))
                log.write("The following files are generated:\n")
                out_gff = os.path.join(gff_outfolder, "_".join([
                    prefix, feature + ".gff"]))
                stat_and_covert2gff(os.path.join(
                    table_folder, "_".join([prefix, suffixs["csv"]])),
                    feature_id, out_gff, args_ribo.fuzzy, out_stat,
                    feature)
                log.write("\t" + out_gff + "\n")
                log.write("\t" + out_stat + "\n")
                fh.close()

    def _remove_tmp(self, args_ribo):
        """Remove the multiparser tmp dirs of all input folders."""
        self.helper.remove_tmp_dir(args_ribo.gffs)
        self.helper.remove_tmp_dir(args_ribo.fastas)
        self.helper.remove_tmp_dir(args_ribo.trans)
        self.helper.remove_tmp_dir(args_ribo.tsss)

    def _remove_overlap(self, gff_path, tmp_files, suffixs, type_, fuzzy,
                        log):
        """Remove overlapping candidates from each strain's tmp table
        (delegates to rbs_overlap, which updates the table in place)."""
        log.write("Running rbs_overlap.py to remove the overlapping "
                  "riboswitches/RNA thermometers.\n")
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                tmp_table = os.path.join(os.path.join(
                    tmp_files["table"], "_".join([
                        gff.replace(".gff", ""), suffixs["csv"]])))
                rbs_overlap(tmp_table,
                            os.path.join(gff_path, gff), type_, fuzzy)
                log.write("\t" + tmp_table + " is updated.\n")

    def _core_prediction(self, args_ribo, feature_id, rfam, tmp_files,
                         table_folder, feature, scan_folder, suffixs,
                         stat_folder, gff_outfolder, out_folder, type_,
                         log):
        '''main part of detection'''
        # fetch + compress the Rfam models, run the two-pass scan, drop
        # overlaps, merge per-strain results, annotate with Rfam details,
        # then delete all tmp_* dirs under out_folder.
        log.write("Running get_Rfam_ribo.py to get the information of "
                  "riboswitches/RNA thermometers from Rfam.\n")
        rbs_from_rfam(feature_id, args_ribo.rfam, rfam)
        log.write("Using Infernal to compress the Rfam data of "
                  "riboswitches/RNA thermometers.\n")
        log.write("Please make sure the version of Infernal is at least 1.1.1.\n")
        print("Compressing Rfam of " + feature)
        log.write(" ".join([args_ribo.cmpress_path, "-F", rfam]) + "\n")
        call([args_ribo.cmpress_path, "-F", rfam])
        log.write("Done!\n")
        prefixs = []
        self.helper.check_make_folder(tmp_files["fasta"])
        self.helper.check_make_folder(tmp_files["scan"])
        self.helper.check_make_folder(tmp_files["table"])
        prefixs = self._scan_extract_rfam(
            prefixs, args_ribo, tmp_files, suffixs, feature, rfam, log)
        self._remove_overlap(self.gff_path, tmp_files, suffixs, type_,
                             args_ribo.fuzzy, log)
        self._merge_results(args_ribo, scan_folder, suffixs, tmp_files,
                            table_folder, stat_folder, feature_id,
                            gff_outfolder, feature, log)
        log.write("Running map_ribos.py to extract all the details from Rfam.\n")
        mapping_ribos(table_folder, feature_id, feature)
        log.write("The following files are updated:\n")
        for file_ in os.listdir(table_folder):
            log.write("\t" + os.path.join(table_folder, file_) + "\n")
        self.helper.remove_all_content(out_folder, "tmp", "dir")

    def run_ribos(self, args_ribo, log_t, log_r):
        """Entry point: validate inputs, then run riboswitch and/or RNA
        thermometer prediction depending on args_ribo.program.

        log_t / log_r are the thermometer / riboswitch log file objects;
        either may be None when that program is not selected.
        NOTE(review): "riboswtiches" typo is in a runtime message and is
        left unchanged here.
        """
        if args_ribo.fuzzy_rbs > 6:
            if log_t is not None:
                log_t.write("--fuzzy_rbs should be equal or less "
                            "than 6!\n")
            if log_r is not None:
                log_r.write("--fuzzy_rbs should be equal or less "
                            "than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        if args_ribo.tsss is not None:
            self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_ribo.gffs, gff))
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "riboswitch"):
            print("Detecting riboswtiches now")
            self._core_prediction(
                args_ribo, args_ribo.ribos_id, self.ribos_rfam,
                self.ribos_tmp_files, self.ribos_table_folder,
                "riboswitch", self.ribos_scan_folder,
                self.ribos_suffixs, self.ribos_stat_folder,
                self.ribos_gff_outfolder, args_ribo.ribos_out_folder,
                "riboswitch", log_r)
        if (args_ribo.program.lower() == "both") or (
                args_ribo.program.lower() == "thermometer"):
            print("Detecting RNA thermometers now")
            self._core_prediction(
                args_ribo, args_ribo.thermo_id, self.thermo_rfam,
                self.thermo_tmp_files, self.thermo_table_folder,
                "RNA_thermometer", self.thermo_scan_folder,
                self.thermo_suffixs, self.thermo_stat_folder,
                self.thermo_gff_outfolder, args_ribo.thermo_out_folder,
                "thermometer", log_t)
        self._remove_tmp(args_ribo)
class Ribos(object):
    """Riboswitch detection (riboswitch-only variant).

    NOTE(review): this is a second definition of `Ribos` in the same
    chunk; if both live in one module, this later definition shadows the
    earlier riboswitch+thermometer version — confirm this duplication is
    intentional (e.g. two concatenated file versions).
    """

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        # "tmp" subfolders are created by the multiparser for each input set
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        self.stat_folder = os.path.join(args_ribo.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_ribo.out_folder, "gffs")
        self.table_folder = os.path.join(args_ribo.out_folder, "tables")
        self.scan_folder = os.path.join(args_ribo.out_folder, "scan_Rfam")
        self.ribos_rfam = os.path.join(args_ribo.database,
                                       "Rfam_riboswitch.cm")
        self.tmp_files = {"fasta": os.path.join(args_ribo.out_folder,
                                                "tmp_fasta"),
                          "scan": os.path.join(args_ribo.out_folder,
                                               "tmp_scan"),
                          "table": os.path.join(args_ribo.out_folder,
                                                "tmp_table")}
        self.suffixs = {"csv": "riboswitch.csv",
                        "txt": "riboswitch_prescan.txt",
                        "re_txt": "riboswitch_scan.txt",
                        "re_csv": "riboswitch_scan.csv"}

    def _run_infernal(self, args_ribo, seq, type_, prefix):
        """Run Infernal cmscan on seq with an E-value inclusion threshold;
        stdout goes to the scan file, whose path is returned."""
        scan_file = os.path.join(self.tmp_files["scan"],
                                 "_".join([prefix, self.suffixs[type_]]))
        scan = open(scan_file, "w")
        call([os.path.join(args_ribo.infernal_path, "cmscan"), "--incE",
              str(args_ribo.e_value), "--acc", self.ribos_rfam, seq],
             stdout=scan)
        scan.close()
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo):
        """Two-pass candidate scan per strain gff: extract candidate
        sequences, pre-scan, regenerate refined sequences from the hits,
        re-scan, and fold the refined table into the first one.
        Appends each strain prefix to prefixs and returns it."""
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(self.tmp_files["fasta"],
                                         prefix + ".fa")
                prefixs.append(prefix)
                print("extracting seq of riboswitch candidates of "
                      "{0}".format(prefix))
                extract_potential_rbs(
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.gff_path, gff),
                    os.path.join(self.tss_path, prefix + "_TSS.gff"),
                    os.path.join(self.tran_path,
                                 prefix + "_transcript.gff"),
                    first_seq, args_ribo)
                print("pre-scanning of {0}".format(prefix))
                first_scan_file = self._run_infernal(args_ribo,
                                                     first_seq,
                                                     "txt", prefix)
                sec_seq = os.path.join(
                    self.tmp_files["fasta"],
                    "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                    self.tmp_files["table"],
                    "_".join([prefix, self.suffixs["csv"]]))
                regenerate_seq(first_scan_file, first_seq,
                               first_table, sec_seq)
                print("scanning of {0}".format(prefix))
                sec_scan_file = self._run_infernal(args_ribo, sec_seq,
                                                   "re_txt", prefix)
                sec_table = os.path.join(
                    self.tmp_files["table"],
                    "_".join([prefix, self.suffixs["re_csv"]]))
                reextract_rbs(sec_scan_file, first_table, sec_table)
                # the refined table replaces the pre-scan table
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo):
        """Merge the per-strain tmp tables/scan files into the final
        table and scan folders, then compute per-prefix statistics and
        the output gff (stat_and_covert2gff)."""
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merge results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(
                    os.path.join(self.scan_folder, prefix))
                fh = open(os.path.join(args_ribo.gffs, gff))
                # first strain seeds the table (copy); later strains are
                # appended (merge)
                for entry in self.gff_parser.entries(fh):
                    if entry.seq_id != pre_strain:
                        if len(pre_strain) == 0:
                            shutil.copyfile(
                                os.path.join(
                                    self.tmp_files["table"],
                                    "_".join([entry.seq_id,
                                              self.suffixs["csv"]])),
                                os.path.join(
                                    self.table_folder,
                                    "_".join([prefix,
                                              self.suffixs["csv"]])))
                        else:
                            self.helper.merge_file(
                                os.path.join(
                                    self.tmp_files["table"],
                                    "_".join([entry.seq_id,
                                              self.suffixs["csv"]])),
                                os.path.join(
                                    self.table_folder,
                                    "_".join([prefix,
                                              self.suffixs["csv"]])))
                        shutil.copy(
                            os.path.join(
                                self.tmp_files["scan"],
                                "_".join([entry.seq_id,
                                          self.suffixs["txt"]])),
                            os.path.join(self.scan_folder, prefix))
                        shutil.copy(
                            os.path.join(
                                self.tmp_files["scan"],
                                "_".join([entry.seq_id,
                                          self.suffixs["re_txt"]])),
                            os.path.join(self.scan_folder, prefix))
                        pre_strain = entry.seq_id
                out_stat = os.path.join(
                    self.stat_folder,
                    "_".join(["stat", prefix, "riboswitch.txt"]))
                print("compute statistics of {0}".format(prefix))
                stat_and_covert2gff(
                    os.path.join(self.table_folder,
                                 "_".join([prefix,
                                           self.suffixs["csv"]])),
                    args_ribo.ribos_id,
                    os.path.join(self.gff_outfolder,
                                 "_".join([prefix, "riboswitch.gff"])),
                    args_ribo.fuzzy, out_stat)
                fh.close()

    def _remove_tmp(self, args_ribo):
        """Delete multiparser tmp dirs and all tmp_* dirs under out_folder."""
        self.helper.remove_tmp(args_ribo.gffs)
        self.helper.remove_tmp(args_ribo.fastas)
        self.helper.remove_all_content(args_ribo.out_folder, "tmp", "dir")

    def _remove_overlap(self, gff_path):
        """Remove overlapping candidates from each strain's tmp table
        (rbs_overlap updates it in place)."""
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                rbs_overlap(
                    os.path.join(os.path.join(
                        self.tmp_files["table"],
                        "_".join([gff.replace(".gff", ""),
                                  self.suffixs["csv"]]))),
                    os.path.join(gff_path, gff))

    def run_ribos(self, args_ribo):
        """Entry point: validate inputs, prepare the Rfam riboswitch
        models (cmpress), scan, de-overlap, merge, annotate with Rfam
        details, and clean up."""
        if args_ribo.fuzzy_rbs > 6:
            print("Error: --fuzzy_rbs should be equal or less than 6!!")
            sys.exit()
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_ribo.gffs, gff))
        rbs_from_rfam(args_ribo.ribos_id, args_ribo.rfam,
                      self.ribos_rfam)
        print("compressing Rfam...")
        call([os.path.join(args_ribo.infernal_path, "cmpress"), "-F",
              self.ribos_rfam])
        prefixs = []
        self.helper.check_make_folder(self.tmp_files["fasta"])
        self.helper.check_make_folder(self.tmp_files["scan"])
        self.helper.check_make_folder(self.tmp_files["table"])
        prefixs = self._scan_extract_rfam(prefixs, args_ribo)
        self._remove_overlap(self.gff_path)
        self._merge_results(args_ribo)
        mapping_ribos(self.table_folder, args_ribo.ribos_id)
        self._remove_tmp(args_ribo)
class sORFDetection(object):
    '''detection of sORF

    Orchestrates small-ORF (sORF) detection: combines genome annotation,
    transcripts, optional TSS/sRNA data and coverage wiggle files, then
    delegates the actual computation to project helpers (get_intergenic,
    sorf_detection, stat, reorganize_table).
    '''

    def __init__(self, args_sorf):
        # Project helpers for splitting/combining per-genome annotation files.
        self.multiparser = Multiparser()
        self.helper = Helper()
        # TSS and sRNA inputs are optional; when given, per-genome copies
        # live under "<folder>/tmp" (created by Multiparser).
        if args_sorf.tsss is not None:
            self.tss_path = os.path.join(args_sorf.tsss, "tmp")
        else:
            self.tss_path = None
        if args_sorf.srnas is not None:
            self.srna_path = os.path.join(args_sorf.srnas, "tmp")
        else:
            self.srna_path = None
        # Output layout: gffs/ and tables/, each split into the two
        # candidate classes below (all vs. best).
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        self.tran_path = os.path.join(args_sorf.trans, "tmp")
        self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
        self.all_cand = "all_candidates"
        self.best = "best_candidates"

    def _check_gff(self, gffs):
        # Validate attribute uniqueness of every .gff file in the folder.
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf, log):
        '''Abort unless all required inputs are assigned, then parse them.'''
        # Annotation, transcripts and at least one wiggle set are mandatory.
        if (args_sorf.gffs is None) or (args_sorf.trans is None) or (
                (args_sorf.tex_wigs is None) and (
                 args_sorf.frag_wigs is None)):
            print("Error: lack required files!")
            log.write("genome annotation, transcript file or wiggle files "
                      "are not assigned.\n")
            sys.exit()
        if args_sorf.utr_detect:
            # UTR-derived sORF detection additionally requires TSS data.
            if (args_sorf.tsss is None):
                print("Error: TSS files are required for UTR derived"
                      " sORF detection!")
                log.write("TSS files are required for UTR derived"
                          " sORF detection!\n")
                sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path,
                                         None, "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path,
                                         None, "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf, log):
        '''detect the sORF based on start and stop codon
        and ribosome binding site'''
        log.write("Running sORF_detection.py for detecting sORFs.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            print("Searching sORFs of {0}".format(prefix))
            # sRNA/TSS files are optional; pass None when not provided.
            if self.srna_path is not None:
                srna_file = os.path.join(
                    self.srna_path, "_".join([prefix, "sRNA.gff"]))
            else:
                srna_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(
                    self.tss_path, "_".join([prefix, "TSS.gff"]))
            else:
                tss_file = None
            sorf_detection(os.path.join(self.fasta_path, prefix + ".fa"),
                           srna_file,
                           os.path.join(args_sorf.out_folder,
                                        "_".join([prefix, "inter.gff"])),
                           tss_file,
                           os.path.join(args_sorf.wig_path,
                                        "_".join([prefix, "forward.wig"])),
                           os.path.join(args_sorf.wig_path,
                                        "_".join([prefix, "reverse.wig"])),
                           os.path.join(self.gff_output, self.all_cand,
                                        "_".join([prefix, "sORF"])),
                           args_sorf)
            # sorf_detection drops *_all/*_best files into the
            # all-candidates folder; move them to their final names.
            if "_".join([prefix, "sORF_all.gff"]) in os.listdir(
                    os.path.join(self.gff_output, self.all_cand)):
                gff_all = os.path.join(self.gff_output, self.all_cand,
                                       "_".join([prefix, "sORF.gff"]))
                gff_best = os.path.join(self.gff_output, self.best,
                                        "_".join([prefix, "sORF.gff"]))
                csv_all = os.path.join(self.table_output, self.all_cand,
                                       "_".join([prefix, "sORF.csv"]))
                csv_best = os.path.join(self.table_output, self.best,
                                        "_".join([prefix, "sORF.csv"]))
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                                         "_".join([prefix, "sORF_all.gff"])),
                            gff_all)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                                         "_".join([prefix, "sORF_best.gff"])),
                            gff_best)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                                         "_".join([prefix, "sORF_all.csv"])),
                            csv_all)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                                         "_".join([prefix, "sORF_best.csv"])),
                            csv_best)
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")

    def _remove_tmp(self, args_sorf):
        # Clean all temporary files/folders created during the run.
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        self.helper.remove_tmp_dir(args_sorf.fastas)
        self.helper.remove_tmp_dir(args_sorf.gffs)
        self.helper.remove_tmp_dir(args_sorf.tsss)
        self.helper.remove_tmp_dir(args_sorf.trans)
        self.helper.remove_tmp_dir(args_sorf.srnas)
        if "temp_wig" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "temp_wig"))
        if "merge_wigs" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "merge_wigs"))

    def _compare_tran_cds(self, args_sorf, log):
        '''compare transcript and CDS to find the intergenic region'''
        prefixs = []
        log.write("Running sORF_intergenic.py to extract the sequences of "
                  "potential sORFs\n")
        for gff in os.listdir(args_sorf.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Comparing transcripts and CDSs of {0}".format(prefix))
                # Writes <prefix>_inter.gff holding candidate regions.
                get_intergenic(os.path.join(args_sorf.gffs, gff),
                               os.path.join(self.tran_path,
                                            "_".join([prefix,
                                                      "transcript.gff"])),
                               os.path.join(args_sorf.out_folder,
                                            "_".join([prefix, "inter.gff"])),
                               args_sorf.utr_detect, args_sorf.hypo,
                               args_sorf.extend_5, args_sorf.extend_3)
                log.write("\t" + os.path.join(
                    args_sorf.out_folder, "_".join([prefix, "inter.gff"])) +
                    " is generated to temporary store the sequences.\n")
        return prefixs

    def _re_table(self, args_sorf, prefixs, log):
        # Rebuild the coverage columns of every generated table.
        log.write("Running re_table.py for generating coverage information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates"]:
            for prefix in prefixs:
                table_file = os.path.join(args_sorf.out_folder, "tables",
                                          type_, "_".join([
                                              prefix, "sORF.csv"]))
                reorganize_table(args_sorf.libs, args_sorf.merge_wigs,
                                 "Track_detail", table_file)
                log.write("\t" + table_file + "\n")

    def run_sorf_detection(self, args_sorf, log):
        '''Entry point: run the complete sORF detection workflow.'''
        if args_sorf.fuzzy_rbs > 6:
            log.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self._check_necessary_files(args_sorf, log)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path,
                                     None, "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path, None)
        prefixs = self._compare_tran_cds(args_sorf, log)
        self._start_stop_codon(prefixs, args_sorf, log)
        log.write("Running stat_sorf.py to do statistics.\n")
        for sorf in os.listdir(os.path.join(self.gff_output, self.all_cand)):
            print("Running statistics of {0}".format(sorf))
            if sorf.endswith("_sORF.gff"):
                stat_file = os.path.join(
                    args_sorf.out_folder, "statistics",
                    "_".join(["stat", sorf.replace(".gff", ".csv")]))
                stat(os.path.join(self.gff_output, self.all_cand, sorf),
                     os.path.join(self.gff_output, self.best, sorf),
                     stat_file, args_sorf.utr_detect)
                log.write("\t" + stat_file + " is generated.\n")
        self._re_table(args_sorf, prefixs, log)
        self._remove_tmp(args_sorf)
class Ribos(object):
    '''Riboswitch detection pipeline.

    Extracts candidate regions from each genome, scans them twice with
    Infernal's cmscan against a riboswitch covariance-model file built
    from Rfam, filters overlaps, and merges the per-sequence results
    into per-genome tables, GFF files and statistics.
    '''

    def __init__(self, args_ribo):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.gff_parser = Gff3Parser()
        # "tmp" sub-folders hold the per-genome files split by Multiparser.
        self.gff_path = os.path.join(args_ribo.gffs, "tmp")
        self.tss_path = os.path.join(args_ribo.tsss, "tmp")
        self.tran_path = os.path.join(args_ribo.trans, "tmp")
        self.fasta_path = os.path.join(args_ribo.fastas, "tmp")
        # Final output layout.
        self.stat_folder = os.path.join(args_ribo.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_ribo.out_folder, "gffs")
        self.table_folder = os.path.join(args_ribo.out_folder, "tables")
        self.scan_folder = os.path.join(args_ribo.out_folder, "scan_Rfam")
        # Covariance models of the requested riboswitches (written by
        # rbs_from_rfam in run_ribos, indexed by cmpress).
        self.ribos_rfam = os.path.join(args_ribo.database,
                                       "Rfam_riboswitch.cm")
        # Intermediate folders; removed again by _remove_tmp.
        self.tmp_files = {"fasta": os.path.join(
                          args_ribo.out_folder, "tmp_fasta"),
                          "scan": os.path.join(
                          args_ribo.out_folder, "tmp_scan"),
                          "table": os.path.join(
                          args_ribo.out_folder, "tmp_table")}
        # File-name suffixes for the two scan rounds and their tables.
        self.suffixs = {"csv": "riboswitch.csv",
                        "txt": "riboswitch_prescan.txt",
                        "re_txt": "riboswitch_scan.txt",
                        "re_csv": "riboswitch_scan.csv"}

    def _run_infernal(self, args_ribo, seq, type_, prefix):
        '''Scan *seq* with cmscan and return the path of the report file.

        type_ selects the report suffix: "txt" for the pre-scan,
        "re_txt" for the re-scan.
        '''
        scan_file = os.path.join(self.tmp_files["scan"],
                                 "_".join([prefix, self.suffixs[type_]]))
        # Context manager guarantees the report handle is closed even if
        # the subprocess call raises (the previous open/close pair leaked
        # the descriptor on error).
        with open(scan_file, "w") as scan:
            call([os.path.join(args_ribo.infernal_path, "cmscan"),
                  "--incE", str(args_ribo.e_value), "--acc",
                  self.ribos_rfam, seq], stdout=scan)
        return scan_file

    def _scan_extract_rfam(self, prefixs, args_ribo):
        '''Extract candidate sequences per genome and scan them twice.

        Appends every processed genome prefix to *prefixs* and returns it.
        '''
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                first_seq = os.path.join(self.tmp_files["fasta"],
                                         prefix + ".fa")
                prefixs.append(prefix)
                print("extracting seq of riboswitch candidates of "
                      "{0}".format(prefix))
                # Pull the candidate regions (based on annotation, TSS and
                # transcripts) out of the genome fasta.
                extract_potential_rbs(
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.gff_path, gff),
                    os.path.join(self.tss_path, prefix + "_TSS.gff"),
                    os.path.join(self.tran_path,
                                 prefix + "_transcript.gff"),
                    first_seq, args_ribo)
                print("pre-scanning of {0}".format(prefix))
                first_scan_file = self._run_infernal(args_ribo, first_seq,
                                                     "txt", prefix)
                # Re-extract the pre-scan hits and scan them once more to
                # produce the final per-genome table.
                sec_seq = os.path.join(self.tmp_files["fasta"],
                                       "_".join([prefix, "regenerate.fa"]))
                first_table = os.path.join(
                    self.tmp_files["table"],
                    "_".join([prefix, self.suffixs["csv"]]))
                regenerate_seq(first_scan_file, first_seq, first_table,
                               sec_seq)
                print("scanning of {0}".format(prefix))
                sec_scan_file = self._run_infernal(args_ribo, sec_seq,
                                                   "re_txt", prefix)
                sec_table = os.path.join(
                    self.tmp_files["table"],
                    "_".join([prefix, self.suffixs["re_csv"]]))
                reextract_rbs(sec_scan_file, first_table, sec_table)
                shutil.move(sec_table, first_table)
                modify_table(first_table, args_ribo.output_all)
        return prefixs

    def _merge_results(self, args_ribo):
        '''Merge per-sequence tables and scan reports into per-genome
        outputs, then convert the table to GFF and compute statistics.'''
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                print("Merge results of {0}".format(prefix))
                pre_strain = ""
                self.helper.check_make_folder(os.path.join(
                    self.scan_folder, prefix))
                # Loop-invariant destination table for this genome.
                merge_table = os.path.join(
                    self.table_folder,
                    "_".join([prefix, self.suffixs["csv"]]))
                # "with" replaces the manual open()/close() pair so the
                # annotation file cannot leak if parsing raises.
                with open(os.path.join(args_ribo.gffs, gff)) as fh:
                    for entry in self.gff_parser.entries(fh):
                        if entry.seq_id != pre_strain:
                            strain_table = os.path.join(
                                self.tmp_files["table"],
                                "_".join([entry.seq_id,
                                          self.suffixs["csv"]]))
                            if len(pre_strain) == 0:
                                # First sequence starts the merged table.
                                shutil.copyfile(strain_table, merge_table)
                            else:
                                self.helper.merge_file(strain_table,
                                                       merge_table)
                            shutil.copy(os.path.join(
                                self.tmp_files["scan"],
                                "_".join([entry.seq_id,
                                          self.suffixs["txt"]])),
                                os.path.join(self.scan_folder, prefix))
                            shutil.copy(os.path.join(
                                self.tmp_files["scan"],
                                "_".join([entry.seq_id,
                                          self.suffixs["re_txt"]])),
                                os.path.join(self.scan_folder, prefix))
                            pre_strain = entry.seq_id
                out_stat = os.path.join(
                    self.stat_folder,
                    "_".join(["stat", prefix, "riboswitch.txt"]))
                print("compute statistics of {0}".format(prefix))
                stat_and_covert2gff(merge_table, args_ribo.ribos_id,
                                    os.path.join(
                                        self.gff_outfolder,
                                        "_".join([prefix,
                                                  "riboswitch.gff"])),
                                    args_ribo.fuzzy, out_stat)

    def _remove_tmp(self, args_ribo):
        # Drop all temporary folders/files created during the run.
        self.helper.remove_tmp(args_ribo.gffs)
        self.helper.remove_tmp(args_ribo.fastas)
        self.helper.remove_all_content(args_ribo.out_folder, "tmp", "dir")

    def _remove_overlap(self, gff_path):
        # Filter overlapping riboswitch candidates for every genome.
        for gff in os.listdir(gff_path):
            if gff.endswith(".gff"):
                # The old code wrapped this path in a redundant nested
                # os.path.join(os.path.join(...)); one join suffices.
                rbs_overlap(
                    os.path.join(
                        self.tmp_files["table"],
                        "_".join([gff.replace(".gff", ""),
                                  self.suffixs["csv"]])),
                    os.path.join(gff_path, gff))

    def run_ribos(self, args_ribo):
        '''Entry point: run the complete riboswitch detection workflow.'''
        if args_ribo.fuzzy_rbs > 6:
            print("Error: --fuzzy_rbs should be equal or less than 6!!")
            sys.exit()
        # Split all inputs into per-genome files under <folder>/tmp.
        self.multiparser.parser_gff(args_ribo.gffs, None)
        self.multiparser.parser_fasta(args_ribo.fastas)
        self.multiparser.parser_gff(args_ribo.trans, "transcript")
        self.multiparser.parser_gff(args_ribo.tsss, "TSS")
        for gff in os.listdir(args_ribo.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_ribo.gffs, gff))
        # Build the riboswitch covariance models and index them.
        rbs_from_rfam(args_ribo.ribos_id, args_ribo.rfam, self.ribos_rfam)
        print("compressing Rfam...")
        call([os.path.join(args_ribo.infernal_path, "cmpress"),
              "-F", self.ribos_rfam])
        prefixs = []
        self.helper.check_make_folder(self.tmp_files["fasta"])
        self.helper.check_make_folder(self.tmp_files["scan"])
        self.helper.check_make_folder(self.tmp_files["table"])
        prefixs = self._scan_extract_rfam(prefixs, args_ribo)
        self._remove_overlap(self.gff_path)
        self._merge_results(args_ribo)
        mapping_ribos(self.table_folder, args_ribo.ribos_id)
        self._remove_tmp(args_ribo)