def __init__(self, args_ratt):
    """Create the collaborator objects and derive the working paths.

    Reads ``ref_embls``, ``output_path``, ``tar_fastas``, ``ref_fastas`` and
    ``gff_outfolder`` from *args_ratt*; nothing is created on disk here.
    """
    self.multiparser = Multiparser()
    self.converter = Converter()
    self.format_fixer = FormatFixer()
    self.helper = Helper()

    # Paths located under the reference annotation folder.
    embl_root = args_ratt.ref_embls
    self.gbk = os.path.join(embl_root, "gbk_tmp")
    self.gbk_tmp = os.path.join(self.gbk, "tmp")
    self.embl = os.path.join(embl_root, "embls")
    self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")

    # Temporary locations, keyed by role; key order matches the original.
    scratch = {}
    scratch["tar"] = os.path.join(args_ratt.tar_fastas, "tmp")
    scratch["ref"] = os.path.join(args_ratt.ref_fastas, "tmp")
    gff_root = args_ratt.gff_outfolder
    for key, leaf in (("out_gff", "tmp"), ("gff", "tmp.gff"),
                      ("ptt", "tmp.ptt"), ("rnt", "tmp.rnt")):
        scratch[key] = os.path.join(gff_root, leaf)
    self.tmp_files = scratch
def __init__(self, args_tar):
    """Create the collaborator objects and derive result/temporary paths.

    Reads ``out_folder``, ``srnas``, ``fastas`` and ``gffs`` from *args_tar*;
    no directory is created here.
    """
    self.multiparser = Multiparser()
    self.helper = Helper()
    self.fixer = FormatFixer()
    self.gff_parser = Gff3Parser()

    # Every result folder sits directly below the output folder.
    out_root = args_tar.out_folder
    self.target_seq_path = os.path.join(out_root, "target_seqs")
    self.srna_seq_path = os.path.join(out_root, "sRNA_seqs")
    self.rnaplex_path = os.path.join(out_root, "RNAplex_results")
    self.rnaup_path = os.path.join(out_root, "RNAup_results")
    self.intarna_path = os.path.join(out_root, "IntaRNA_results")
    self.merge_path = os.path.join(out_root, "merged_results")

    # "tmp" sub-folders of the three input folders.
    self.srna_path = os.path.join(args_tar.srnas, "tmp")
    self.fasta_path = os.path.join(args_tar.fastas, "tmp")
    self.gff_path = os.path.join(args_tar.gffs, "tmp")

    # Name stems and glob patterns used for temporary files.
    tmp_names = {}
    tmp_names["tmp"] = "tmp_srna_target"
    tmp_names["rnaup"] = "tmp_rnaup"
    tmp_names["log"] = "tmp_log"
    tmp_names["all_fa"] = "tmp*.fa"
    tmp_names["all_txt"] = "tmp*.txt"
    self.tmps = tmp_names
def setUp(self):
    """Build the fixer, load the example data and write the input fixtures.

    Creates ``test_folder`` in the current directory when it does not exist
    and writes one input file per tool (RATT, RNAplex, EMBOSS) into it.
    """
    self.fixer = FormatFixer()
    self.example = Example()
    sample = self.example
    self.ratt_out = sample.ratt_out
    self.rnaplex_out = sample.rnaplex_out
    self.emboss_out = sample.emboss_out

    self.test_folder = "test_folder"
    if not os.path.exists(self.test_folder):
        os.mkdir(self.test_folder)

    # (attribute receiving the path, file name, Example attribute with the text)
    fixtures = (
        ("ratt_file", "ratt.gff", "ratt_gff"),
        ("rnaplex_file", "rnaplex.txt", "rnaplex_file"),
        ("emboss_file", "emboss.txt", "emboss_file"),
    )
    for attr, file_name, source_attr in fixtures:
        path = os.path.join(self.test_folder, file_name)
        setattr(self, attr, path)
        with open(path, "w") as handle:
            handle.write(getattr(self.example, source_attr))
def __init__(self, args_sub):
    """Create the collaborator objects, derive all paths, then make folders.

    Reads ``gffs``, ``fastas``, ``trans`` and ``out_folder`` from *args_sub*.
    ``tran_path`` is ``None`` when ``args_sub.trans`` is ``None``. The method
    ends by calling ``self._make_folder()``.
    """
    join = os.path.join

    self.multiparser = Multiparser()
    self.helper = Helper()
    self.fixer = FormatFixer()

    self.gff_path = join(args_sub.gffs, "tmp")
    self.fasta_path = join(args_sub.fastas, "tmp")
    trans_dir = args_sub.trans
    self.tran_path = None if trans_dir is None else join(trans_dir, "tmp")

    # Two parallel result trees with the same sub-folder layout.
    all_dir = join(args_sub.out_folder, "all_CDS")
    expressed_dir = join(args_sub.out_folder, "expressed_CDS")
    self.out_all = all_dir
    self.out_express = expressed_dir

    self.all_tmp_path = join(all_dir, "tmp")
    self.express_tmp_path = join(expressed_dir, "tmp")
    self.all_stat_path = join(all_dir, "statistics")
    self.express_stat_path = join(expressed_dir, "statistics")
    self.all_tmp_result = join(all_dir, "tmp_results")
    self.express_tmp_result = join(expressed_dir, "tmp_results")
    self.all_result = join(all_dir, "psortb_results")
    self.express_result = join(expressed_dir, "psortb_results")

    # File-name endings for the output files.
    self.endfix_table = "table.csv"
    self.endfix_raw = "raw.txt"

    self._make_folder()
def __init__(self, args_tar):
    """Create the collaborator objects and derive result/temporary paths.

    Reads ``out_folder``, ``srnas``, ``fastas`` and ``gffs`` from *args_tar*;
    no directory is created here.
    """
    join = os.path.join

    self.multiparser = Multiparser()
    self.helper = Helper()
    self.fixer = FormatFixer()
    self.gff_parser = Gff3Parser()

    # Result folders directly below the output folder.
    result_root = args_tar.out_folder
    self.target_seq_path = join(result_root, "target_seqs")
    self.srna_seq_path = join(result_root, "sRNA_seqs")
    self.rnaplex_path = join(result_root, "RNAplex")
    self.rnaup_path = join(result_root, "RNAup")
    self.merge_path = join(result_root, "merge")

    # "tmp" sub-folders of the three input folders.
    self.srna_path = join(args_tar.srnas, "tmp")
    self.fasta_path = join(args_tar.fastas, "tmp")
    self.gff_path = join(args_tar.gffs, "tmp")

    # Name stems and glob patterns used for temporary files.
    names = {"tmp": "tmp"}
    names.update({"rnaup": "tmp_rnaup", "log": "tmp_log"})
    names.update({"all_fa": "tmp*.fa", "all_txt": "tmp*.txt"})
    self.tmps = names
def __init__(self, args_ratt):
    """Create the collaborator objects and derive the working paths.

    Only computes path strings from *args_ratt* (``ref_embls``,
    ``output_path``, ``tar_fastas``, ``ref_fastas``, ``gff_outfolder``).
    """
    join = os.path.join

    self.multiparser = Multiparser()
    self.converter = Converter()
    self.format_fixer = FormatFixer()
    self.helper = Helper()

    reference_dir = args_ratt.ref_embls
    self.gbk = join(reference_dir, "gbk_tmp")
    self.gbk_tmp = join(self.gbk, "tmp")
    self.embl = join(reference_dir, "embls")
    self.ratt_log = join(args_ratt.output_path, "ratt_log.txt")

    # Temporary folders/files keyed by role (insertion order preserved).
    tmp_files = {
        "tar": join(args_ratt.tar_fastas, "tmp"),
        "ref": join(args_ratt.ref_fastas, "tmp"),
    }
    gff_dir = args_ratt.gff_outfolder
    tmp_files["out_gff"] = join(gff_dir, "tmp")
    tmp_files["gff"] = join(gff_dir, "tmp.gff")
    tmp_files["ptt"] = join(gff_dir, "tmp.ptt")
    tmp_files["rnt"] = join(gff_dir, "tmp.rnt")
    self.tmp_files = tmp_files
class TestFormatFixer(unittest.TestCase):
    """Run the three ``FormatFixer`` fixers against the example fixtures."""

    def setUp(self):
        """Load example data and write one input file per fixer."""
        self.fixer = FormatFixer()
        self.example = Example()
        sample = self.example
        self.ratt_out = sample.ratt_out
        self.rnaplex_out = sample.rnaplex_out
        self.emboss_out = sample.emboss_out

        self.test_folder = "test_folder"
        if not os.path.exists(self.test_folder):
            os.mkdir(self.test_folder)

        # (attribute receiving the path, file name, Example attribute)
        fixtures = (
            ("ratt_file", "ratt.gff", "ratt_gff"),
            ("rnaplex_file", "rnaplex.txt", "rnaplex_file"),
            ("emboss_file", "emboss.txt", "emboss_file"),
        )
        for attr, file_name, source_attr in fixtures:
            path = os.path.join(self.test_folder, file_name)
            setattr(self, attr, path)
            with open(path, "w") as handle:
                handle.write(getattr(self.example, source_attr))

    def tearDown(self):
        """Delete the fixture folder if it is still present."""
        if not os.path.exists(self.test_folder):
            return
        shutil.rmtree(self.test_folder)

    def _assert_same_lines(self, produced_file, expected_text):
        """Compare the lines of *produced_file* with *expected_text* as sets."""
        produced = import_data(produced_file)
        wanted = expected_text.split("\n")
        self.assertEqual(set(produced), set(wanted))

    def test_fix_ratt(self):
        """``fix_ratt`` output must match the expected RATT lines."""
        target = os.path.join(self.test_folder, "ratt.out")
        self.fixer.fix_ratt(self.ratt_file, "Staphylococcus_aureus_HG003",
                            target)
        self._assert_same_lines(target, self.ratt_out)

    def test_fix_rnaplex(self):
        """``fix_rnaplex`` output must match the expected RNAplex lines."""
        target = os.path.join(self.test_folder, "rnaplex.out")
        self.fixer.fix_rnaplex(self.rnaplex_file, target)
        self._assert_same_lines(target, self.rnaplex_out)

    def test_fix_emboss(self):
        """``fix_emboss`` output must match the expected EMBOSS lines."""
        target = os.path.join(self.test_folder, "emboss.out")
        self.fixer.fix_emboss(self.emboss_file, target)
        self._assert_same_lines(target, self.emboss_out)
class RATT(object):
    """Run RATT for reference/target pairs and post-process its output.

    The entry point is :meth:`annotation_transfer`. All other methods are
    private steps of that pipeline.
    """

    def __init__(self, args_ratt):
        """Create the collaborator objects and derive the working paths.

        Only path strings are computed from *args_ratt*; nothing is created
        on disk here.
        """
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
        self.gbk_tmp = os.path.join(self.gbk, "tmp")
        self.embl = os.path.join(args_ratt.ref_embls, "embls")
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files):
        """Write .ptt/.rnt files next to every ``.gff`` name in *files*.

        A file is converted only when ``helper.get_correct_file`` returns a
        truthy fasta for its prefix.
        """
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(
                    self.tmp_files["tar"], ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)

    def _remove_files(self, args_ratt, out_gbk):
        """Replace per-file results with the merged ones and drop temp data."""
        for suffix in (".gff", ".ptt", ".rnt"):
            self.helper.remove_all_content(args_ratt.gff_outfolder,
                                           suffix, "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        shutil.rmtree(self.embl)
        self.helper.remove_all_content(args_ratt.tar_fastas, "_folder", "dir")
        self.helper.remove_all_content(args_ratt.ref_fastas, "_folder", "dir")
        if out_gbk:
            shutil.rmtree(out_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        """Convert one RATT EMBL result to GFF and record its file name.

        The GFF name is the dot-joined middle part of *ratt_result* (first
        field and last two fields dropped). The file name is appended to
        *files*, which is mutated in place.
        """
        name = ratt_result.split(".")
        strain = ".".join(name[1:-2])
        filename = strain + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        # NOTE: "tmp_gff" is created in the current working directory.
        self.format_fixer.fix_ratt(output_file, strain, "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    @staticmethod
    def _first_value(line, keyword):
        """Return the first whitespace-separated token that is not *keyword*.

        Returns ``None`` when the line holds no such token. Splitting on any
        whitespace also removes the trailing newline from the last token.
        """
        for token in line.split():
            if token != keyword:
                return token
        return None

    def _parser_embl_gbk(self, files):
        """Split GenBank files into one ``<name>.gbk`` per record.

        Each record (``LOCUS`` line up to the ``//`` terminator) is written
        to ``self.gbk_tmp`` and then moved into ``self.gbk``. The file name
        comes from the LOCUS line and is replaced by the VERSION value unless
        the VERSION-based name starts with the LOCUS-based one.

        Lines outside a record (before the first LOCUS or after ``//``) are
        ignored. A record without a ``//`` terminator is closed but left in
        ``self.gbk_tmp``.

        Returns:
            ``self.gbk``, the folder holding the split records.

        Raises:
            ValueError: a record ends before any name could be determined.
        """
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            out = None          # handle of the record being written
            filename = None     # target name of the record being written
            try:
                with open(file_, "r") as f_h:
                    for line in f_h:
                        if line.startswith("LOCUS"):
                            if out is not None:
                                # Previous record had no "//": drop its handle
                                # before the temp file is truncated again.
                                out.close()
                            out = open(self.gbk_tmp, "w")
                            locus = self._first_value(line, "LOCUS")
                            if locus is not None:
                                filename = ".".join([locus, "gbk"])
                        elif line.startswith("VERSION"):
                            version = self._first_value(line, "VERSION")
                            if version is not None:
                                new_filename = ".".join([version, "gbk"])
                                # Same condition as the historical
                                # ``new_filename.find(filename)`` truth test.
                                if (filename is None) or (
                                        not new_filename.startswith(filename)):
                                    filename = new_filename
                        if out is not None:
                            out.write(line)
                        if line.startswith("//") and (out is not None):
                            out.close()
                            out = None
                            if filename is None:
                                raise ValueError(
                                    "No LOCUS/VERSION name found in " + file_)
                            shutil.move(self.gbk_tmp,
                                        os.path.join(self.gbk, filename))
            finally:
                if out is not None:
                    out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        """Convert every ``.gbk`` file in *ref_embls* to EMBL.

        Exits the program when the folder holds no ``.gbk`` file.

        Returns:
            The folder with the split GenBank records.
        """
        gbks = [os.path.join(ref_embls, embl)
                for embl in os.listdir(ref_embls)
                if embl.endswith(".gbk")]
        if not gbks:
            print("Error: please assign proper folder for Genebank file!!!")
            sys.exit()
        out_gbk = self._parser_embl_gbk(gbks)
        self.converter.convert_gbk2embl(out_gbk)
        self.helper.check_make_folder(self.embl)
        self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        """Call the RATT executable; stdout goes to *out*, stderr is dropped."""
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        """Run RATT for each ``ref:target`` pair and tidy the working folder.

        After each run, entries of the current working directory whose name
        contains "final" are moved to ``output_path``; other RATT by-products
        are deleted.
        """
        print("Running RATT...")
        for pair in args_ratt.pairs:
            parts = pair.split(":")
            ref = parts[0]
            tar = parts[1]
            # "w+" truncates, so the log only keeps the most recent pair.
            with open(self.ratt_log, "w+") as out:
                print(tar)
                self._run_ratt(args_ratt, tar, ref, out)
                for filename in os.listdir():
                    if ("final" in filename):
                        shutil.move(filename, os.path.join(
                            args_ratt.output_path, filename))
                    elif (args_ratt.element in filename) or (
                            "query" in filename) or (
                            "Reference" in filename) or (
                            "Query" in filename) or (
                            "Sequences" in filename):
                        if os.path.isfile(filename):
                            os.remove(filename)
                        if os.path.isdir(filename):
                            shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt):
        """Run the whole pipeline: prepare inputs, run RATT, convert, clean.

        When ``args_ratt.convert`` is set, the EMBL results are converted to
        GFF/PTT/RNT and merged per target folder into
        ``self.tmp_files["out_gff"]`` before the final cleanup.
        """
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = self._convert_embl(args_ratt.ref_embls)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" in folder:
                    datas = folder.split("_folder")
                    prefix = datas[0][:-3]
                    for file_ in os.listdir(os.path.join(
                            args_ratt.tar_fastas, folder)):
                        files.append(file_[:-3])
                    for gff in os.listdir(args_ratt.gff_outfolder):
                        for file_ in files:
                            # One merge target per annotation format.
                            for ext in ("gff", "ptt", "rnt"):
                                if (("." + ext) in gff) and (
                                        file_ == gff[:-4]):
                                    self.helper.merge_file(
                                        os.path.join(
                                            args_ratt.gff_outfolder, gff),
                                        self.tmp_files[ext])
                    for ext in ("gff", "ptt", "rnt"):
                        shutil.move(self.tmp_files[ext], os.path.join(
                            self.tmp_files["out_gff"], prefix + "." + ext))
        self._remove_files(args_ratt, out_gbk)
class sRNATargetPrediction(object):
    '''detection of sRNA-target interaction'''

    def __init__(self, args_tar):
        """Create the collaborator objects and derive result/temporary paths.

        Only path strings are computed from *args_tar*; nothing is created
        on disk here.
        """
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        self.target_seq_path = os.path.join(args_tar.out_folder,
                                            "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder,
                                         "RNAplex_results")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results")
        self.intarna_path = os.path.join(args_tar.out_folder,
                                         "IntaRNA_results")
        self.merge_path = os.path.join(args_tar.out_folder, "merged_results")
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        self.tmps = {"tmp": "tmp_srna_target", "rnaup": "tmp_rnaup",
                     "log": "tmp_log",
                     "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}

    def _check_gff(self, gffs):
        """Run ``helper.check_uni_attributes`` on every ``.gff`` in *gffs*."""
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _remove_matching_files(self, folder, pattern):
        """Delete the regular files in *folder* whose name matches *pattern*.

        Replaces a shell ``rm <folder>/<pattern>`` call so that folder names
        containing spaces or shell metacharacters are handled correctly.
        """
        import glob  # local import: only this helper needs it
        for path in glob.glob(os.path.join(glob.escape(folder), pattern)):
            if os.path.isfile(path):
                os.remove(path)

    def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span,
                       unstr_region, seq_path, prefix, out_path, log):
        """Run RNAplfold inside *out_path* with a fasta file on stdin.

        For ``file_type == "sRNA"`` the input is the temporary sRNA fasta
        (name prefixed with ``self.tmps["tmp"]``); otherwise it is
        ``<prefix>_<file_type>.fa``. The command line is written to *log*.
        """
        current = os.getcwd()
        command = " ".join([rnaplfold_path, "-W", str(win_size),
                            "-L", str(span), "-u", str(unstr_region), "-O"])
        if file_type == "sRNA":
            fasta_name = "_".join([self.tmps["tmp"], prefix,
                                   file_type + ".fa"])
        else:
            fasta_name = "_".join([prefix, file_type + ".fa"])
        shell_line = "<".join([command,
                               os.path.join(current, seq_path, fasta_name)])
        os.chdir(out_path)
        try:
            log.write(shell_line + "\n")
            # NOTE(review): executed through the shell; paths containing
            # spaces or metacharacters are not quoted here.
            os.system(shell_line)
        finally:
            # Always return to the caller's working directory.
            os.chdir(current)

    def _wait_process(self, processes):
        """Wait for every process, close its pipes, then pause 5 seconds."""
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        """Write the records of *fasta* sorted by sequence length.

        The output is ``<tmp>_<prefix>_sRNA.fa`` in *path*. Each non-header
        line is treated as one record of the preceding header, and headers
        are truncated at the first ``|``.
        """
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line,
                                  "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        out_name = os.path.join(
            path, "_".join([self.tmps["tmp"], prefix, "sRNA.fa"]))
        with open(out_name, "w") as out:
            for srna in srnas:
                out.write(">" + srna["name"].split("|")[0] + "\n")
                out.write(srna["seq"] + "\n")

    def _read_fasta(self, fasta_file):
        """Return all sequence lines of *fasta_file* joined into one string."""
        parts = []
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if not line.startswith(">"):
                    parts.append(line)
        return "".join(parts)

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        """Append the sequences of the queried sRNAs to *srna_out*.

        Each query is ``seq_id:start:end:strand``. The program exits when a
        query matches no entry of *srna_file*.
        """
        seq = None  # genome sequence, read once on first use
        for query in querys:
            srna_datas = query.split(":")
            srna = {"seq_id": srna_datas[0], "strand": srna_datas[3],
                    "start": int(srna_datas[1]), "end": int(srna_datas[2])}
            with open(srna_file, "r") as gff_f, open(srna_out, "a") as out:
                if seq is None:
                    seq = self._read_fasta(seq_file)
                num = 0
                detect = False
                for entry in self.gff_parser.entries(gff_f):
                    if (entry.seq_id == srna["seq_id"]) and (
                            entry.strand == srna["strand"]) and (
                            entry.start == srna["start"]) and (
                            entry.end == srna["end"]):
                        detect = True
                        if "ID" in entry.attributes.keys():
                            id_ = entry.attributes["ID"]
                        else:
                            id_ = entry.feature + str(num)
                        gene = self.helper.extract_gene(
                            seq, entry.start, entry.end, entry.strand)
                        out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                            id_, entry.seq_id, entry.start,
                            entry.end, entry.strand, gene))
                        num += 1
                if not detect:
                    print("Error: Some of the query sRNAs do not exist!")
                    sys.exit()

    def _gen_seq(self, prefixs, args_tar):
        """Create the sRNA and target fasta files for every genome.

        Every detected prefix is appended to *prefixs* (mutated in place).
        The target fasta of each genome is also split into numbered chunk
        files; a new chunk starts at every 100th header.
        """
        print("Generating sRNA fasta files")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    self.helper.get_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out)
                else:
                    if "_".join([prefix, "sRNA.fa"]) in os.listdir(
                            self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)
        print("Generating target fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path,
                                              prefix + ".fa"),
                                 os.path.join(self.target_seq_path),
                                 args_tar)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                sub_out = open("_".join([sub_prefix,
                                         str(file_num) + ".fa"]), "w")
                try:
                    with open((sub_prefix + ".fa"), "r") as t_f:
                        for line in t_f:
                            line = line.strip()
                            if line.startswith(">"):
                                num += 1
                            if (num == 100):
                                # Start the next chunk file with this header.
                                num = 0
                                file_num += 1
                                sub_out.close()
                                sub_out = open("_".join([
                                    sub_prefix,
                                    str(file_num) + ".fa"]), "w")
                            sub_out.write(line + "\n")
                finally:
                    sub_out.close()

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar, log):
        """Run RNAplex once per target chunk file of *prefix*.

        Processes are started in batches of ``args_tar.core_plex``; each
        process writes to ``<prefix>_RNAplex_<index>.txt``.

        Returns:
            Number of RNAplex processes started (= number of result files).
        """
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        query_fasta = os.path.join(
            self.srna_seq_path,
            "_".join([self.tmps["tmp"], prefix, "sRNA.fa"]))
        for seq in os.listdir(self.target_seq_path):
            if (prefix in seq) and ("_target_" in seq):
                print("Running RNAplex with {0}".format(seq))
                out_name = os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(num_process) + ".txt"]))
                num_process += 1
                command = [args_tar.rnaplex_path,
                           "-q", query_fasta,
                           "-t", os.path.join(self.target_seq_path, seq),
                           "-l", str(args_tar.inter_length),
                           "-e", str(args_tar.energy),
                           "-z", str(args_tar.duplex_dist),
                           "-a", rnaplfold_folder]
                log.write(" ".join(command) + "\n")
                # The child keeps its own copy of the descriptor, so the
                # parent's handle can be closed right after the spawn.
                with open(out_name, "w") as out_rnaplex:
                    p = Popen(command, stdout=out_rnaplex)
                processes.append(p)
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
                    # Finished processes must not be waited on (and slept
                    # for) again in the following batches.
                    processes = []
        self._wait_process(processes)
        log.write("The prediction for {0} is done.\n".format(prefix))
        log.write("The following temporary files for storing results of {0} are "
                  "generated:\n".format(prefix))
        for file_ in os.listdir(os.path.join(self.rnaplex_path, prefix)):
            log.write("\t" + os.path.join(
                self.rnaplex_path, prefix, file_) + "\n")
        return num_process

    def _rna_plex(self, prefixs, args_tar, log):
        """Run RNAplfold and RNAplex for every prefix and merge the results."""
        log.write("Using RNAplex and RNAplfold to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                os.path.join(self.rnaplex_path, prefix))
            rnaplfold_folder = os.path.join(self.rnaplex_path, prefix,
                                            "RNAplfold")
            os.mkdir(rnaplfold_folder)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "sRNA", args_tar.win_size_s,
                args_tar.span_s, args_tar.unstr_region_rnaplex_s,
                self.srna_seq_path, prefix, rnaplfold_folder, log)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "target", args_tar.win_size_t,
                args_tar.span_t, args_tar.unstr_region_rnaplex_t,
                self.target_seq_path, prefix, rnaplfold_folder, log)
            num_process = self._run_rnaplex(prefix, rnaplfold_folder,
                                            args_tar, log)
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            if ("_".join([prefix, "RNAplex.txt"]) in
                    os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                log.write("Using helper.py to merge the temporary files.\n")
                self.helper.merge_file(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(index) + ".txt"])),
                    rnaplex_file)
            log.write("\t" + rnaplex_file + " is generated.\n")
            self.helper.remove_all_content(os.path.join(
                self.rnaplex_path, prefix), "_RNAplex_", "file")
            # NOTE: the fixed file is written to the current working
            # directory before it replaces the merged result.
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, prefix, out_rnaup, out_log,
                   args_tar, log):
        """Start RNAup for the prepared inputs 1..*num_up* and merge outputs.

        Started processes are appended to *processes* (mutated in place).
        When at least one process exists, they are awaited, the temporary
        fasta files are deleted, the per-process outputs are merged into
        *out_rnaup*/*out_log*, and the temporary text files are deleted.
        """
        command = [args_tar.rnaup_path,
                   "-u", str(args_tar.unstr_region_rnaup),
                   "-o", "--interaction_first"]
        for index in range(1, num_up + 1):
            result_name = os.path.join(
                args_tar.out_folder,
                "".join([self.tmps["rnaup"], str(index), ".txt"]))
            error_name = os.path.join(
                args_tar.out_folder,
                "".join([self.tmps["log"], str(index), ".txt"]))
            input_name = os.path.join(
                args_tar.out_folder,
                "".join([self.tmps["tmp"], str(index), ".fa"]))
            # The child inherits the descriptors at spawn time; the parent
            # copies are closed as soon as the process has started.
            with open(result_name, "w") as out_tmp_up, \
                    open(error_name, "w") as out_err, \
                    open(input_name, "r") as in_up:
                log.write(" ".join(command) + "\n")
                p = Popen(command, stdin=in_up, stdout=out_tmp_up,
                          stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            log.write("The following temporary files for storing results of {0} are "
                      "generated:\n".format(prefix))
            for file_ in os.listdir(os.path.join(args_tar.out_folder)):
                log.write("\t" + os.path.join(
                    args_tar.out_folder, file_) + "\n")
            self._remove_matching_files(args_tar.out_folder,
                                        self.tmps["all_fa"])
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            self._remove_matching_files(args_tar.out_folder,
                                        self.tmps["all_txt"])

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        """Merge per-process RNAup result and log files 1..*num_up*."""
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["rnaup"],
                                                  str(index), ".txt"])),
                out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["log"],
                                                  str(index), ".txt"])),
                out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, it can continue running RNAup based on previous run'''
        srnas = []
        matchs = {}
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        # The last sRNA is dropped from the kept results and from the
        # returned list, so the caller runs it again.
        srnas = srnas[:-1]
        # NOTE: "tmp.txt" is created in the current working directory.
        with open("tmp.txt", "w") as out:
            for srna in srnas:
                out.write(">" + srna + "\n")
                for target in matchs[srna]:
                    out.write(target + "\n")
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, args_tar, log):
        """Run RNAup for every sRNA of every prefix, ``core_up`` at a time.

        With ``args_tar.continue_rnaup`` set, sRNAs already present in a
        previous result file are skipped; otherwise old results are removed.
        """
        log.write("Using RNAup to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    log.write("The data from the previous run is found.\n")
                    srnas = self._get_continue(out_rnaup)
                    log.write("The previous data is loaded.\n")
            start = False   # no record selected before the first header
            out_up = None   # input file of the record being prepared
            with open(os.path.join(self.srna_seq_path, "_".join([
                    self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        if out_up is not None:
                            # Header without sequence: release its handle.
                            out_up.close()
                        out_up = open(os.path.join(
                            args_tar.out_folder,
                            "".join([self.tmps["tmp"],
                                     str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            out_up = None
                            self.helper.merge_file(os.path.join(
                                self.target_seq_path,
                                "_".join([prefix, "target.fa"])),
                                os.path.join(
                                    args_tar.out_folder,
                                    "".join([self.tmps["tmp"],
                                             str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes, prefix,
                                                out_rnaup, out_log,
                                                args_tar, log)
                                processes = []
                                num_up = 0
            if out_up is not None:
                out_up.close()
            self._run_rnaup(num_up, processes, prefix, out_rnaup, out_log,
                            args_tar, log)
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + out_rnaup +
                      " is complete generated and updated.\n")

    def _intarna(self, prefixs, args_tar, log):
        """Run IntaRNA once per prefix, writing ``<prefix>_IntaRNA.txt``."""
        log.write("Using IntaRNA to predict sRNA targets.\n")
        log.write("Please make sure the version of IntaRNA is at least 2.0.4.\n")
        for prefix in prefixs:
            print("Running IntaRNA of {0}".format(prefix))
            intarna_file = os.path.join(self.intarna_path, prefix,
                                        prefix + "_IntaRNA.txt")
            self.helper.check_make_folder(
                os.path.join(self.intarna_path, prefix))
            call([args_tar.intarna_path,
                  "-q", os.path.join(
                      self.srna_seq_path, "_".join([
                          self.tmps["tmp"], prefix, "sRNA.fa"])),
                  "-t", os.path.join(self.target_seq_path,
                                     prefix + "_target.fa"),
                  "--qAccW", str(args_tar.slide_win_srna),
                  "--qAccL", str(args_tar.max_loop_srna),
                  "--tAccW", str(args_tar.slide_win_target),
                  "--tAccL", str(args_tar.max_loop_target),
                  "--outMode", "C", "-m", args_tar.mode_intarna,
                  "--threads", str(args_tar.core_inta),
                  "--out", intarna_file])
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + intarna_file + " is generated.\n")

    def _merge_rnaplex_rnaup(self, prefixs, args_tar, log):
        '''merge the result of IntaRNA, RNAup and RNAplex'''
        log.write("Running merge_rnaplex_rnaup.py to merge the results from "
                  "RNAplex, RNAup, and IntaRNA for generating finanl output.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            intarna_file = None
            out_intarna = None
            self.helper.check_make_folder(os.path.join(
                self.merge_path, prefix))
            print("Ranking {0} now".format(prefix))
            if ("RNAplex" in args_tar.program):
                rnaplex_file = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex_rank.csv"]))
                self._remove_repeat(rnaplex_file, "RNAplex")
            if ("RNAup" in args_tar.program):
                rnaup_file = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup_rank.csv"]))
                self._remove_repeat(rnaup_file, "RNAup")
            if ("IntaRNA" in args_tar.program):
                intarna_file = os.path.join(
                    self.intarna_path, prefix,
                    "_".join([prefix, "IntaRNA.txt"]))
                out_intarna = os.path.join(
                    self.intarna_path, prefix,
                    "_".join([prefix, "IntaRNA_rank.csv"]))
                self._remove_repeat(intarna_file, "IntaRNA")
            overlap_file = os.path.join(self.merge_path, prefix,
                                        "_".join([prefix, "overlap.csv"]))
            merge_file = os.path.join(self.merge_path, prefix,
                                      "_".join([prefix, "merge.csv"]))
            merge_srna_target(
                rnaplex_file, rnaup_file, intarna_file, args_tar,
                out_rnaplex, out_rnaup, out_intarna,
                os.path.join(self.fasta_path, prefix + ".fa"),
                merge_file, overlap_file,
                os.path.join(self.srna_path,
                             "_".join([prefix, "sRNA.gff"])),
                os.path.join(self.gff_path, prefix + ".gff"))
            if ("RNAplex" in args_tar.program):
                log.write("\t" + out_rnaplex + "\n")
            if ("RNAup" in args_tar.program):
                log.write("\t" + out_rnaup + "\n")
            if ("IntaRNA" in args_tar.program):
                log.write("\t" + out_intarna + "\n")
            if (os.path.exists(merge_file)):
                log.write("\t" + merge_file + "\n")
            if (os.path.exists(overlap_file)):
                log.write("\t" + overlap_file + "\n")

    def _remove_rnaplex(self, line, num, pre_num, pre, checks,
                        out_tmp, print_):
        """Process one RNAplex result line for duplicate removal.

        Header lines (starting with ``>``) alternate: odd *num* is stored as
        *pre*, even *num* is recorded under ``checks[pre]``. Other lines are
        written only while *print_* is set, preceded by the two headers when
        they were not yet written for this pair.

        Returns:
            The updated ``(num, pre_num, print_, pre)`` state.
        """
        if (line.startswith(">")):
            if (num % 2 == 1):
                print_ = False
                pre = line
                if (line not in checks):
                    checks[line] = []
                    print_ = True
            elif (num % 2 == 0) and (line not in checks[pre]):
                checks[pre].append(line)
                print_ = True
            num = num + 1
        else:
            if (print_):
                if (num != pre_num):
                    out_tmp.write(pre + "\n")
                    out_tmp.write(checks[pre][-1] + "\n")
                out_tmp.write(line + "\n")
                pre_num = num
        return num, pre_num, print_, pre,

    def _remove_rnaup(self, line, pre, num, pre_num, srna_info,
                      checks, out_tmp, print_, tar):
        """Process one RNAup result line for duplicate removal.

        *checks* maps a header to the list of headers recorded after it;
        lines are written only for pairs that were not seen before.

        Returns:
            The updated ``(num, pre_num, print_, pre, tar, srna_info)`` state.
        """
        if (line.startswith(">")):
            print_ = False
            tar = False
            if (pre.startswith(">")):
                # Two headers in a row: *pre* is recorded as a new key.
                if (pre not in checks):
                    checks[pre] = [line]
                    srna_info = pre
                    print_ = True
                else:
                    if (line not in checks[pre]):
                        checks[pre].append(line)
                        print_ = True
            else:
                # Header after a result line: recorded under *srna_info*.
                if (num != 1):
                    if (line not in checks[srna_info]):
                        checks[srna_info].append(line)
                        print_ = True
        else:
            if (print_):
                if (pre_num != len(checks)):
                    out_tmp.write(srna_info + "\n")
                    out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                else:
                    if (not tar):
                        out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                pre_num = len(checks)
                tar = True
        pre = line
        num = num + 1
        return num, pre_num, print_, pre, tar, srna_info

    def _remove_intarna(self, line, checks, tar, srna_info, seq, out_tmp):
        """Process one IntaRNA result line for duplicate removal.

        Rows with at least eight ``;``-separated fields are keyed by field 0
        and field 3; only the first row per key pair is written. Lines
        starting with ``.``, ``(`` or ``)`` are written once per key pair.

        Returns:
            The updated ``(tar, srna_info, seq)`` state.
        """
        if (line.startswith(".")) or (
                line.startswith("(")) or (
                line.startswith(")")):
            seq = line.split(";")[0]
            if (seq not in checks[tar][srna_info]):
                checks[tar][srna_info].append(seq)
                out_tmp.write(line + "\n")
        else:
            if (len(line.split(";")) >= 8):
                tar = line.split(";")[0]
                srna_info = line.split(";")[3]
                seq = line.split(";")[7]
                if (tar not in checks):
                    checks[tar] = {}
                    checks[tar][srna_info] = [seq]
                    out_tmp.write(line + "\n")
                else:
                    if (srna_info not in checks[tar]):
                        checks[tar][srna_info] = [seq]
                        out_tmp.write(line + "\n")
        return tar, srna_info, seq

    def _remove_repeat(self, interact_file, type_):
        """Rewrite *interact_file* in place without repeated predictions.

        *type_* selects the line handler: ``"RNAplex"``, ``"RNAup"`` or
        ``"IntaRNA"``. Lines are ignored for any other value.
        """
        checks = {}
        seq = ""
        pre = ""
        srna_info = ""
        num = 1
        tar = False
        pre_num = 0
        print_ = False
        with open(interact_file + "tmp", "w") as out_tmp, \
                open(interact_file) as fh:
            for line in fh:
                line = line.strip()
                if (type_ == "RNAplex"):
                    num, pre_num, print_, pre = self._remove_rnaplex(
                        line, num, pre_num, pre, checks, out_tmp, print_)
                elif (type_ == "RNAup"):
                    num, pre_num, print_, pre, tar, srna_info = (
                        self._remove_rnaup(
                            line, pre, num, pre_num,
                            srna_info, checks, out_tmp, print_, tar))
                elif (type_ == "IntaRNA"):
                    tar, srna_info, seq = self._remove_intarna(
                        line, checks, tar, srna_info, seq, out_tmp)
        shutil.move(interact_file + "tmp", interact_file)

    def run_srna_target_prediction(self, args_tar, log):
        """Run the selected prediction programs, merge results and clean up."""
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        self._gen_seq(prefixs, args_tar)
        if ("RNAplex" in args_tar.program):
            self._rna_plex(prefixs, args_tar, log)
        self.helper.remove_all_content(self.target_seq_path,
                                       "_target_", "file")
        log.write("The temporary files for running RNAplex are deleted.\n")
        if ("RNAup" in args_tar.program):
            self._rnaup(prefixs, args_tar, log)
        if ("IntaRNA" in args_tar.program):
            self._intarna(prefixs, args_tar, log)
        self._merge_rnaplex_rnaup(prefixs, args_tar, log)
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "dir")
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
class sRNATargetPrediction(object):
    '''detection of sRNA-target interaction'''

    def __init__(self, args_tar):
        '''Create the helper objects and derive every working path.

        Output folders are placed below ``args_tar.out_folder``; the
        per-genome input files are read from the ``tmp`` sub-folder of
        ``args_tar.srnas``, ``args_tar.fastas`` and ``args_tar.gffs``.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        self.target_seq_path = os.path.join(args_tar.out_folder,
                                            "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder,
                                         "RNAplex_results")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results")
        self.merge_path = os.path.join(args_tar.out_folder, "merged_results")
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        # Name fragments / glob patterns of the temporary files.
        self.tmps = {"tmp": "tmp_srna_target", "rnaup": "tmp_rnaup",
                     "log": "tmp_log",
                     "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}

    def _check_gff(self, gffs):
        '''Run ``helper.check_uni_attributes`` on every ``.gff`` in *gffs*.'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _remove_by_pattern(self, pattern):
        '''Delete all regular files matching the glob *pattern*.

        Replaces ``os.system("rm " + pattern)`` so that folders containing
        spaces or shell metacharacters are handled correctly.
        '''
        import glob
        for path in glob.glob(pattern):
            if os.path.isfile(path):
                os.remove(path)

    def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span,
                       unstr_region, seq_path, prefix, out_path):
        '''Run RNAplfold inside *out_path* with a fasta file as stdin.

        For ``file_type == "sRNA"`` the length-sorted temporary fasta
        (``tmp_srna_target_<prefix>_sRNA.fa``) is used, otherwise
        ``<prefix>_<file_type>.fa``.  Both are looked up in *seq_path*
        relative to the directory the process was started from.
        '''
        current = os.getcwd()
        command = " ".join([rnaplfold_path,
                            "-W", str(win_size),
                            "-L", str(span),
                            "-u", str(unstr_region),
                            "-O"])
        if file_type == "sRNA":
            fasta_name = "_".join([self.tmps["tmp"], prefix,
                                   file_type + ".fa"])
        else:
            fasta_name = "_".join([prefix, file_type + ".fa"])
        fasta = os.path.join(current, seq_path, fasta_name)
        # The command is executed with out_path as working directory;
        # always switch back, even when it fails.
        os.chdir(out_path)
        try:
            os.system("<".join([command, fasta]))
        finally:
            os.chdir(current)

    def _wait_process(self, processes):
        '''Wait for every process, close its pipes and make sure it is gone.

        Note that a pause of 5 seconds follows each process.
        '''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        '''Write the records of *fasta* sorted by sequence length.

        The result is stored in *path* as
        ``tmp_srna_target_<prefix>_sRNA.fa``; only the part of each header
        before the first ``|`` is kept.  Every non-header line is treated
        as one complete sequence.
        '''
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line,
                                  "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        out_file = os.path.join(
            path, "_".join([self.tmps["tmp"], prefix, "sRNA.fa"]))
        with open(out_file, "w") as out:
            for srna in srnas:
                out.write(">" + srna["name"].split("|")[0] + "\n")
                out.write(srna["seq"] + "\n")

    def _read_fasta(self, fasta_file):
        '''Return all sequence lines of *fasta_file* joined to one string.'''
        parts = []
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if not line.startswith(">"):
                    parts.append(line)
        return "".join(parts)

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        '''Append the sequences of the queried sRNAs to *srna_out*.

        Each query has the form ``seq_id:start:end:strand``.  The program
        prints an error and exits when a query matches no entry of
        *srna_file*.
        '''
        seq = None
        for query in querys:
            srna_datas = query.split(":")
            srna = {"seq_id": srna_datas[0], "strand": srna_datas[3],
                    "start": int(srna_datas[1]), "end": int(srna_datas[2])}
            if seq is None:
                # The genome is the same for every query; read it once.
                seq = self._read_fasta(seq_file)
            num = 0
            detect = False
            with open(srna_file, "r") as gff_f, open(srna_out, "a") as out:
                for entry in self.gff_parser.entries(gff_f):
                    if (entry.seq_id == srna["seq_id"]) and (
                            entry.strand == srna["strand"]) and (
                            entry.start == srna["start"]) and (
                            entry.end == srna["end"]):
                        detect = True
                        if "ID" in entry.attributes.keys():
                            id_ = entry.attributes["ID"]
                        else:
                            id_ = entry.feature + str(num)
                        gene = self.helper.extract_gene(
                            seq, entry.start, entry.end, entry.strand)
                        out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                            id_, entry.seq_id, entry.start,
                            entry.end, entry.strand, gene))
                        num += 1
            if not detect:
                print("Error: Some of the query sRNAs do not exist!")
                sys.exit()

    def _split_target_fasta(self, sub_prefix):
        '''Split ``<sub_prefix>.fa`` into ``<sub_prefix>_<n>.fa`` chunks.

        A new chunk is started at every 100th header line.
        '''
        file_num = 1
        num = 0
        sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]), "w")
        try:
            with open((sub_prefix + ".fa"), "r") as t_f:
                for line in t_f:
                    line = line.strip()
                    if line.startswith(">"):
                        num += 1
                    if (num == 100):
                        num = 0
                        file_num += 1
                        sub_out.close()
                        sub_out = open("_".join([
                            sub_prefix, str(file_num) + ".fa"]), "w")
                    sub_out.write(line + "\n")
        finally:
            sub_out.close()

    def _gen_seq(self, prefixs, args_tar):
        '''Generate the sRNA and the target fasta files.

        The prefix of every ``*_sRNA.gff`` file is appended to *prefixs*.
        '''
        print("Generating sRNA fasta files")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    self.helper.get_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out)
                else:
                    # _get_specific_seq appends, so drop an old result.
                    if "_".join([prefix, "sRNA.fa"]) in os.listdir(
                            self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)
        print("Generating target fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(
                    os.path.join(self.gff_path, gff),
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.target_seq_path), args_tar)
                self._split_target_fasta(os.path.join(
                    self.target_seq_path, "_".join([prefix, "target"])))

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar):
        '''Run RNAplex for every split target file of *prefix*.

        At most ``args_tar.core_plex`` processes run at the same time.
        Returns the number of started processes, i.e. the number of
        ``<prefix>_RNAplex_<index>.txt`` files that were written.
        '''
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        query = os.path.join(
            self.srna_seq_path,
            "_".join([self.tmps["tmp"], prefix, "sRNA.fa"]))
        for seq in os.listdir(self.target_seq_path):
            if (prefix in seq) and ("_target_" in seq):
                print("Running RNAplex with {0}".format(seq))
                out_file = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex",
                              str(num_process) + ".txt"]))
                num_process += 1
                # The child keeps its own copy of the descriptor, so the
                # parent handle can be closed right after the start.
                with open(out_file, "w") as out_rnaplex:
                    p = Popen([args_tar.rnaplex_path,
                               "-q", query,
                               "-t", os.path.join(self.target_seq_path, seq),
                               "-l", str(args_tar.inter_length),
                               "-e", str(args_tar.energy),
                               "-z", str(args_tar.duplex_dist),
                               "-a", rnaplfold_folder], stdout=out_rnaplex)
                processes.append(p)
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
                    # Finished processes must not be waited for again.
                    processes = []
        self._wait_process(processes)
        return num_process

    def _rna_plex(self, prefixs, args_tar):
        '''Run RNAplfold and RNAplex for every prefix and merge the output
        into ``<prefix>_RNAplex.txt``.'''
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            prefix_folder = os.path.join(self.rnaplex_path, prefix)
            self.helper.check_make_folder(prefix_folder)
            rnaplfold_folder = os.path.join(prefix_folder, "RNAplfold")
            os.mkdir(rnaplfold_folder)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "sRNA", args_tar.win_size_s,
                args_tar.span_s, args_tar.unstr_region_rnaplex_s,
                self.srna_seq_path, prefix, rnaplfold_folder)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "target", args_tar.win_size_t,
                args_tar.span_t, args_tar.unstr_region_rnaplex_t,
                self.target_seq_path, prefix, rnaplfold_folder)
            num_process = self._run_rnaplex(
                prefix, rnaplfold_folder, args_tar)
            rnaplex_file = os.path.join(prefix_folder,
                                        "_".join([prefix, "RNAplex.txt"]))
            if ("_".join([prefix, "RNAplex.txt"]) in
                    os.listdir(prefix_folder)):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                self.helper.merge_file(
                    os.path.join(prefix_folder, "_".join([
                        prefix, "RNAplex", str(index) + ".txt"])),
                    rnaplex_file)
            self.helper.remove_all_content(prefix_folder,
                                           "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, out_rnaup, out_log, args_tar):
        '''Start RNAup for the temporary inputs 1..*num_up*, wait for all
        of them and merge the results into *out_rnaup* / *out_log*.'''
        for index in range(1, num_up + 1):
            tmp_out = os.path.join(
                args_tar.out_folder,
                "".join([self.tmps["rnaup"], str(index), ".txt"]))
            tmp_err = os.path.join(
                args_tar.out_folder,
                "".join([self.tmps["log"], str(index), ".txt"]))
            tmp_in = os.path.join(
                args_tar.out_folder,
                "".join([self.tmps["tmp"], str(index), ".fa"]))
            # The parent handles are closed as soon as the child started;
            # the child works on its own copies of the descriptors.
            with open(tmp_out, "w") as out_tmp_up, \
                    open(tmp_err, "w") as out_err, \
                    open(tmp_in, "r") as in_up:
                p = Popen([args_tar.rnaup_path, "-u",
                           str(args_tar.unstr_region_rnaup),
                           "-o", "--interaction_first"],
                          stdin=in_up, stdout=out_tmp_up, stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            self._remove_by_pattern(os.path.join(args_tar.out_folder,
                                                 self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            self._remove_by_pattern(os.path.join(args_tar.out_folder,
                                                 self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        '''Merge the temporary RNAup results and logs 1..*num_up*.'''
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder, "".join([
                    self.tmps["rnaup"], str(index), ".txt"])),
                out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder, "".join([
                    self.tmps["log"], str(index), ".txt"])),
                out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, it can continue running RNAup based on previous run

        *out_rnaup* is rewritten without its last sRNA record (this record
        is dropped unconditionally, the previous run may have stopped in
        the middle of it).  Returns the names of the sRNAs that were kept.
        '''
        srnas = []
        matchs = {}
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        srnas = srnas[:-1]
        # The scratch file is placed next to the result instead of into the
        # current directory, where it could overwrite an unrelated tmp.txt.
        tmp_file = out_rnaup + ".tmp"
        with open(tmp_file, "w") as out:
            for srna in srnas:
                out.write(">" + srna + "\n")
                for target in matchs[srna]:
                    out.write(target + "\n")
        os.remove(out_rnaup)
        shutil.move(tmp_file, out_rnaup)
        return srnas

    def _rnaup(self, prefixs, args_tar):
        '''Run RNAup for every sRNA of every prefix.

        One temporary fasta (sRNA followed by all targets) is written per
        sRNA and ``args_tar.core_up`` of them are processed per batch.
        The sRNA fasta is expected to hold one sequence line per record,
        as written by ``_sort_srna_fasta``.
        '''
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    srnas = self._get_continue(out_rnaup)
            with open(os.path.join(self.srna_seq_path, "_".join([
                    self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            # Already done in the previous run.
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(os.path.join(
                            args_tar.out_folder, "".join([
                                self.tmps["tmp"], str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            self.helper.merge_file(
                                os.path.join(
                                    self.target_seq_path,
                                    "_".join([prefix, "target.fa"])),
                                os.path.join(
                                    args_tar.out_folder, "".join([
                                        self.tmps["tmp"],
                                        str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes,
                                                out_rnaup, out_log, args_tar)
                                processes = []
                                num_up = 0
            # Process the remaining, incomplete batch.
            self._run_rnaup(num_up, processes, out_rnaup, out_log, args_tar)

    def _merge_rnaplex_rnaup(self, prefixs, args_tar):
        '''merge the result of RNAup and RNAplex'''
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            self.helper.check_make_folder(os.path.join(
                self.merge_path, prefix))
            print("Ranking {0} now".format(prefix))
            if (args_tar.program == "both") or (
                    args_tar.program == "RNAplex"):
                rnaplex_file = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex_rank.csv"]))
            if (args_tar.program == "both") or (
                    args_tar.program == "RNAup"):
                rnaup_file = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup_rank.csv"]))
            merge_srna_target(
                rnaplex_file, rnaup_file, args_tar,
                out_rnaplex, out_rnaup,
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(self.merge_path, prefix,
                             "_".join([prefix, "merge.csv"])),
                os.path.join(self.merge_path, prefix,
                             "_".join([prefix, "overlap.csv"])),
                os.path.join(self.srna_path,
                             "_".join([prefix, "sRNA.gff"])),
                os.path.join(self.gff_path, prefix + ".gff"))

    def run_srna_target_prediction(self, args_tar):
        '''Entry point: prepare the sequences, run the selected programs
        (``args_tar.program`` is "RNAplex", "RNAup" or "both"), merge the
        results and remove all temporary files.'''
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        self._gen_seq(prefixs, args_tar)
        if (args_tar.program == "both") or (
                args_tar.program == "RNAplex"):
            self._rna_plex(prefixs, args_tar)
        # The split target files are always generated by _gen_seq.
        self.helper.remove_all_content(self.target_seq_path,
                                       "_target_", "file")
        if (args_tar.program == "both") or (
                args_tar.program == "RNAup"):
            self._rnaup(prefixs, args_tar)
        self._merge_rnaplex_rnaup(prefixs, args_tar)
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "dir")
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
class sRNATargetPrediction(object):
    '''detection of sRNA-target interaction'''

    def __init__(self, args_tar):
        '''Create the helper objects and derive every working path.

        Output folders are placed below ``args_tar.out_folder``; the
        per-genome input files are read from the ``tmp`` sub-folder of
        ``args_tar.srnas``, ``args_tar.fastas`` and ``args_tar.gffs``.
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        self.target_seq_path = os.path.join(args_tar.out_folder,
                                            "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder,
                                         "RNAplex_results")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results")
        self.intarna_path = os.path.join(args_tar.out_folder,
                                         "IntaRNA_results")
        self.merge_path = os.path.join(args_tar.out_folder, "merged_results")
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        # Name fragments / glob patterns of the temporary files.
        self.tmps = {"tmp": "tmp_srna_target", "rnaup": "tmp_rnaup",
                     "log": "tmp_log",
                     "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}

    def _check_gff(self, gffs):
        '''Run ``helper.check_uni_attributes`` on every ``.gff`` in *gffs*.'''
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _remove_by_pattern(self, pattern):
        '''Delete all regular files matching the glob *pattern*.

        Replaces ``os.system("rm " + pattern)`` so that folders containing
        spaces or shell metacharacters are handled correctly.
        '''
        import glob
        for path in glob.glob(pattern):
            if os.path.isfile(path):
                os.remove(path)

    def _check_long_id(self, seq_file, long_ids, type_):
        '''Copy *seq_file* to ``<seq_file>_tmp.fa`` with short headers.

        Header lines longer than 40 characters (including ``>``) are
        replaced by ``>TMP<type_>_<n>``; the original ID is appended to
        ``long_ids[type_]`` so that ``n`` is its 1-based position there.
        Returns the name of the copy.
        '''
        # NOTE(review): presumably needed because RNAplfold/RNAplex cannot
        # deal with long sequence IDs - confirm against the Vienna docs.
        out_file = seq_file + "_tmp.fa"
        with open(out_file, "w") as out, open(seq_file) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">") and (len(line) > 40):
                    long_ids[type_].append(line[1:])
                    out.write(">TMP" + type_ + "_" +
                              str(len(long_ids[type_])) + "\n")
                else:
                    out.write(line + "\n")
        return out_file

    def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span,
                       unstr_region, long_ids, seq_path, prefix,
                       out_path, log):
        '''Run RNAplfold inside *out_path*.

        For ``file_type == "sRNA"`` the length-sorted temporary sRNA fasta
        of *prefix* is used; otherwise every file of *seq_path* whose name
        contains ``<prefix>_<file_type>_`` (the split target files) is
        processed.  The input is first passed through ``_check_long_id``
        and every executed command is written to *log*.
        '''
        current = os.getcwd()
        command = " ".join([rnaplfold_path,
                            "-W", str(win_size),
                            "-L", str(span),
                            "-u", str(unstr_region),
                            "-O"])
        # The commands are executed with out_path as working directory;
        # always switch back, even when something fails.
        os.chdir(out_path)
        try:
            if file_type == "sRNA":
                srna_seq_file = os.path.join(
                    current, seq_path,
                    "_".join([self.tmps["tmp"], prefix, file_type + ".fa"]))
                out_file = self._check_long_id(srna_seq_file, long_ids,
                                               "srna")
                log.write("<".join([command, out_file]) + "\n")
                os.system("<".join([command, out_file]))
            else:
                for tar_seq_file in os.listdir(
                        os.path.join(current, seq_path)):
                    if (prefix + "_" + file_type + "_") in tar_seq_file:
                        out_file = self._check_long_id(
                            os.path.join(current, seq_path, tar_seq_file),
                            long_ids, "tar")
                        log.write("<".join([command, out_file]) + "\n")
                        os.system("<".join([command, out_file]))
        finally:
            os.chdir(current)

    def _wait_process(self, processes):
        '''Wait for every process, close its pipes and make sure it is gone.

        Note that a pause of 5 seconds follows each process.
        '''
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        '''Write the records of *fasta* sorted by sequence length.

        The result is stored in *path* as
        ``tmp_srna_target_<prefix>_sRNA.fa``; only the part of each header
        before the first ``|`` is kept.  Every non-header line is treated
        as one complete sequence.
        '''
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line,
                                  "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        out_file = os.path.join(
            path, "_".join([self.tmps["tmp"], prefix, "sRNA.fa"]))
        with open(out_file, "w") as out:
            for srna in srnas:
                out.write(">" + srna["name"].split("|")[0] + "\n")
                out.write(srna["seq"] + "\n")

    def _read_fasta(self, fasta_file):
        '''Return all sequence lines of *fasta_file* joined to one string.'''
        parts = []
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if not line.startswith(">"):
                    parts.append(line)
        return "".join(parts)

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        '''Append the sequences of the queried sRNAs to *srna_out*.

        Each query has the form ``seq_id:start:end:strand``.  The program
        prints an error and exits when a query matches no entry of
        *srna_file*.
        '''
        seq = None
        for query in querys:
            srna_datas = query.split(":")
            srna = {"seq_id": srna_datas[0], "strand": srna_datas[3],
                    "start": int(srna_datas[1]), "end": int(srna_datas[2])}
            if seq is None:
                # The genome is the same for every query; read it once.
                seq = self._read_fasta(seq_file)
            num = 0
            detect = False
            with open(srna_file, "r") as gff_f, open(srna_out, "a") as out:
                for entry in self.gff_parser.entries(gff_f):
                    if (entry.seq_id == srna["seq_id"]) and (
                            entry.strand == srna["strand"]) and (
                            entry.start == srna["start"]) and (
                            entry.end == srna["end"]):
                        detect = True
                        if "ID" in entry.attributes.keys():
                            id_ = entry.attributes["ID"]
                        else:
                            id_ = entry.feature + str(num)
                        gene = self.helper.extract_gene(
                            seq, entry.start, entry.end, entry.strand)
                        out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                            id_, entry.seq_id, entry.start,
                            entry.end, entry.strand, gene))
                        num += 1
            if not detect:
                print("Error: Some of the query sRNAs do not exist!")
                sys.exit()

    def _split_target_fasta(self, sub_prefix):
        '''Split ``<sub_prefix>.fa`` into ``<sub_prefix>_<n>.fa`` chunks.

        A new chunk is started at every 100th header line.  Returns True
        when the input contained at least one line.
        '''
        file_num = 1
        num = 0
        detect = False
        sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]), "w")
        try:
            with open((sub_prefix + ".fa"), "r") as t_f:
                for line in t_f:
                    line = line.strip()
                    if line.startswith(">"):
                        num += 1
                    if (num == 100):
                        num = 0
                        file_num += 1
                        sub_out.close()
                        sub_out = open("_".join([
                            sub_prefix, str(file_num) + ".fa"]), "w")
                    detect = True
                    sub_out.write(line + "\n")
        finally:
            sub_out.close()
        return detect

    def _gen_seq(self, prefixs, target_prefixs, args_tar):
        '''Generate the target and the sRNA fasta files.

        The prefix of every target ``.gff`` is appended to
        *target_prefixs*, the prefix of every ``*_sRNA.gff`` to *prefixs*.
        The program exits when no target sequence could be generated.
        '''
        # This stage produces the target files (the message used to
        # announce sRNA files, which are generated further below).
        print("Generating target fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                target_prefixs.append(prefix)
        detect = False
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(
                    os.path.join(self.gff_path, gff),
                    os.path.join(self.fasta_path, prefix + ".fa"),
                    os.path.join(self.target_seq_path), args_tar,
                    target_prefixs)
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                if os.path.exists(sub_prefix + ".fa"):
                    if self._split_target_fasta(sub_prefix):
                        detect = True
                else:
                    # Later steps merge <prefix>_target.fa of every target
                    # prefix, so an empty file is provided.
                    open(sub_prefix + ".fa", "w").close()
        if not detect:
            print("No assigned features can be found. "
                  "Please check your genome annotation. "
                  "And assign correct features to --target_feature.")
            sys.exit()
        print("Generating sRNA fasta files")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    self.helper.get_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out)
                else:
                    # _get_specific_seq appends, so drop an old result.
                    if "_".join([prefix, "sRNA.fa"]) in os.listdir(
                            self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar, log):
        '''Run RNAplex of the sRNAs of *prefix* against every split target
        file (``*_target_*.fa_tmp.fa``).

        At most ``args_tar.core_plex`` processes run at the same time.
        Returns the number of started processes, i.e. the number of
        ``<prefix>_RNAplex_<index>.txt`` files that were written.
        '''
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        query = os.path.join(
            self.srna_seq_path,
            "_".join([self.tmps["tmp"], prefix, "sRNA.fa_tmp.fa"]))
        for seq in os.listdir(self.target_seq_path):
            if ("_target_" in seq) and (".fa_tmp.fa" in seq):
                print("Running RNAplex with {0}".format(
                    seq.replace(".fa_tmp.fa", "")))
                out_file = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex",
                              str(num_process) + ".txt"]))
                num_process += 1
                # Built once, used for the log and for the execution.
                command = [args_tar.rnaplex_path,
                           "-q", query,
                           "-t", os.path.join(self.target_seq_path, seq),
                           "-l", str(args_tar.inter_length),
                           "-e", str(args_tar.energy),
                           "-z", str(args_tar.duplex_dist),
                           "-a", rnaplfold_folder]
                # The child keeps its own copy of the descriptor, so the
                # parent handle can be closed right after the start.
                with open(out_file, "w") as out_rnaplex:
                    log.write(" ".join(command) + "\n")
                    p = Popen(command, stdout=out_rnaplex)
                processes.append(p)
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
                    # Finished processes must not be waited for again.
                    processes = []
        self._wait_process(processes)
        log.write("The prediction for {0} is done.\n".format(prefix))
        log.write("The following temporary files for storing results of {0} "
                  "are generated:\n".format(prefix))
        for file_ in os.listdir(os.path.join(self.rnaplex_path, prefix)):
            log.write("\t" + os.path.join(self.rnaplex_path, prefix, file_) +
                      "\n")
        return num_process

    def _restore_long_ids(self, rnaplex_file, long_ids):
        '''Replace the ``>TMPtar_<n>`` / ``>TMPsrna_<n>`` headers of
        *rnaplex_file* by the original IDs stored in *long_ids*.

        This reverts the renaming done by ``_check_long_id``; the file is
        rewritten in place.
        '''
        with open(rnaplex_file + "tmp", "w") as out, \
                open(rnaplex_file, "r") as t_f:
            for line in t_f:
                line = line.strip()
                if (line.startswith(">")):
                    if (line.startswith(">TMPtar_")):
                        header = long_ids["tar"][
                            int(line.split("_")[1]) - 1]
                    elif (line.startswith(">TMPsrna_")):
                        header = long_ids["srna"][
                            int(line.split("_")[1]) - 1]
                    else:
                        header = line[1:]
                    out.write(">" + header + "\n")
                else:
                    out.write(line + "\n")
        shutil.move(rnaplex_file + "tmp", rnaplex_file)

    def _rna_plex(self, prefixs, target_prefixs, args_tar, log):
        '''Run RNAplfold and RNAplex and merge the output of every prefix
        into ``<prefix>_RNAplex.txt``.

        The accessibility profiles of the targets are computed once in
        ``tmp_RNAplfold`` and copied for every sRNA prefix.
        '''
        log.write("Using RNAplex and RNAplfold to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        tmp_rnaplfold_folder = os.path.join(self.rnaplex_path,
                                            "tmp_RNAplfold")
        if os.path.exists(tmp_rnaplfold_folder):
            shutil.rmtree(tmp_rnaplfold_folder)
        os.mkdir(tmp_rnaplfold_folder)
        long_ids = {"tar": [], "srna": []}
        for prefix in target_prefixs:
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "target", args_tar.win_size_t,
                args_tar.span_t, args_tar.unstr_region_rnaplex_t, long_ids,
                self.target_seq_path, prefix, tmp_rnaplfold_folder, log)
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            prefix_folder = os.path.join(self.rnaplex_path, prefix)
            self.helper.check_make_folder(prefix_folder)
            rnaplfold_folder = os.path.join(prefix_folder, "RNAplfold")
            shutil.copytree(tmp_rnaplfold_folder, rnaplfold_folder)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "sRNA", args_tar.win_size_s,
                args_tar.span_s, args_tar.unstr_region_rnaplex_s, long_ids,
                self.srna_seq_path, prefix, rnaplfold_folder, log)
            num_process = self._run_rnaplex(
                prefix, rnaplfold_folder, args_tar, log)
            rnaplex_file = os.path.join(prefix_folder,
                                        "_".join([prefix, "RNAplex.txt"]))
            if ("_".join([prefix, "RNAplex.txt"]) in
                    os.listdir(prefix_folder)):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                log.write("Using helper.py to merge the temporary files.\n")
                self.helper.merge_file(
                    os.path.join(prefix_folder, "_".join([
                        prefix, "RNAplex", str(index) + ".txt"])),
                    rnaplex_file)
            if (len(long_ids["tar"]) != 0) or (len(long_ids["srna"]) != 0):
                self._restore_long_ids(rnaplex_file, long_ids)
            log.write("\t" + rnaplex_file + " is generated.\n")
            self.helper.remove_all_content(prefix_folder,
                                           "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, prefix, out_rnaup, out_log,
                   args_tar, log):
        '''Start RNAup for the temporary inputs 1..*num_up*, wait for all
        of them and merge the results into *out_rnaup* / *out_log*.

        *prefix* is only used for the log messages.
        '''
        command = [args_tar.rnaup_path, "-u",
                   str(args_tar.unstr_region_rnaup),
                   "-o", "--interaction_first"]
        for index in range(1, num_up + 1):
            tmp_out = os.path.join(
                args_tar.out_folder,
                "".join([self.tmps["rnaup"], str(index), ".txt"]))
            tmp_err = os.path.join(
                args_tar.out_folder,
                "".join([self.tmps["log"], str(index), ".txt"]))
            tmp_in = os.path.join(
                args_tar.out_folder,
                "".join([self.tmps["tmp"], str(index), ".fa"]))
            # The parent handles are closed as soon as the child started;
            # the child works on its own copies of the descriptors.
            with open(tmp_out, "w") as out_tmp_up, \
                    open(tmp_err, "w") as out_err, \
                    open(tmp_in, "r") as in_up:
                log.write(" ".join(command) + "\n")
                p = Popen(command, stdin=in_up, stdout=out_tmp_up,
                          stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            log.write("The following temporary files for storing results "
                      "of {0} are generated:\n".format(prefix))
            for file_ in os.listdir(os.path.join(args_tar.out_folder)):
                log.write("\t" + os.path.join(args_tar.out_folder, file_) +
                          "\n")
            self._remove_by_pattern(os.path.join(args_tar.out_folder,
                                                 self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            self._remove_by_pattern(os.path.join(args_tar.out_folder,
                                                 self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        '''Merge the temporary RNAup results and logs 1..*num_up*.'''
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder, "".join([
                    self.tmps["rnaup"], str(index), ".txt"])),
                out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder, "".join([
                    self.tmps["log"], str(index), ".txt"])),
                out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, it can continue running RNAup based on previous run

        *out_rnaup* is rewritten without its last sRNA record (this record
        is dropped unconditionally, the previous run may have stopped in
        the middle of it).  Returns the names of the sRNAs that were kept.
        '''
        srnas = []
        matchs = {}
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        srnas = srnas[:-1]
        # The scratch file is placed next to the result instead of into the
        # current directory, where it could overwrite an unrelated tmp.txt.
        tmp_file = out_rnaup + ".tmp"
        with open(tmp_file, "w") as out:
            for srna in srnas:
                out.write(">" + srna + "\n")
                for target in matchs[srna]:
                    out.write(target + "\n")
        os.remove(out_rnaup)
        shutil.move(tmp_file, out_rnaup)
        return srnas

    def _rnaup(self, prefixs, target_prefixs, args_tar, log):
        '''Run RNAup for every sRNA of every prefix.

        One temporary fasta (sRNA followed by the targets of all
        *target_prefixs*) is written per sRNA and ``args_tar.core_up`` of
        them are processed per batch.  The sRNA fasta is expected to hold
        one sequence line per record, as written by ``_sort_srna_fasta``.
        '''
        log.write("Using RNAup to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    log.write("The data from the previous run is found.\n")
                    srnas = self._get_continue(out_rnaup)
                    log.write("The previous data is loaded.\n")
            with open(os.path.join(self.srna_seq_path, "_".join([
                    self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            # Already done in the previous run.
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(os.path.join(
                            args_tar.out_folder, "".join([
                                self.tmps["tmp"], str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            # A separate loop variable is required here:
                            # reusing "prefix" would replace the sRNA
                            # prefix that is reported below.
                            for target_prefix in target_prefixs:
                                self.helper.merge_file(
                                    os.path.join(
                                        self.target_seq_path,
                                        "_".join([target_prefix,
                                                  "target.fa"])),
                                    os.path.join(
                                        args_tar.out_folder, "".join([
                                            self.tmps["tmp"],
                                            str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes, prefix,
                                                out_rnaup, out_log,
                                                args_tar, log)
                                processes = []
                                num_up = 0
            # Process the remaining, incomplete batch.
            self._run_rnaup(num_up, processes, prefix, out_rnaup, out_log,
                            args_tar, log)
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + out_rnaup +
                      " is complete generated and updated.\n")

    def _intarna(self, prefixs, target_prefixs, args_tar, log):
        '''Run IntaRNA of the sRNAs of every prefix against the merged
        targets (``all_target.fa``) of all *target_prefixs*.'''
        log.write("Using IntaRNA to predict sRNA targets.\n")
        log.write("Please make sure the version of IntaRNA is at least "
                  "2.0.4.\n")
        all_target = os.path.join(self.target_seq_path, "all_target.fa")
        if os.path.exists(all_target):
            os.remove(all_target)
        for prefix in target_prefixs:
            self.helper.merge_file(
                os.path.join(self.target_seq_path, prefix + "_target.fa"),
                all_target)
        for prefix in prefixs:
            print("Running IntaRNA of {0}".format(prefix))
            intarna_file = os.path.join(self.intarna_path, prefix,
                                        prefix + "_IntaRNA.txt")
            self.helper.check_make_folder(
                os.path.join(self.intarna_path, prefix))
            call([args_tar.intarna_path,
                  "-q", os.path.join(
                      self.srna_seq_path,
                      "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])),
                  "-t", all_target,
                  "--qAccW", str(args_tar.slide_win_srna),
                  "--qAccL", str(args_tar.max_loop_srna),
                  "--tAccW", str(args_tar.slide_win_target),
                  "--tAccL", str(args_tar.max_loop_target),
                  "--outMode", "C", "-m", args_tar.mode_intarna,
                  "--threads", str(args_tar.core_inta),
                  "--out", intarna_file])
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + intarna_file + " is generated.\n")

    def _merge_rnaplex_rnaup(self, prefixs, target_prefixs, args_tar, log):
        '''merge the result of IntaRNA, RNAup and RNAplex'''
        log.write("Running merge_rnaplex_rnaup.py to merge the results from "
                  "RNAplex, RNAup, and IntaRNA for generating finanl "
                  "output.\n")
        log.write("The following files are generated:\n")
        all_gff = os.path.join(self.gff_path, "all.gff")
        if os.path.exists(all_gff):
            os.remove(all_gff)
        for prefix in target_prefixs:
            self.helper.merge_file(
                os.path.join(self.gff_path, prefix + ".gff"), all_gff)
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            intarna_file = None
            out_intarna = None
            self.helper.check_make_folder(os.path.join(
                self.merge_path, prefix))
            print("Ranking {0} now".format(prefix))
            if ("RNAplex" in args_tar.program):
                rnaplex_file = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex_rank.csv"]))
                self._remove_repeat(rnaplex_file, "RNAplex")
            if ("RNAup" in args_tar.program):
                rnaup_file = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(
                    self.rnaup_path, prefix,
                    "_".join([prefix, "RNAup_rank.csv"]))
                self._remove_repeat(rnaup_file, "RNAup")
            if ("IntaRNA" in args_tar.program):
                intarna_file = os.path.join(
                    self.intarna_path, prefix,
                    "_".join([prefix, "IntaRNA.txt"]))
                out_intarna = os.path.join(
                    self.intarna_path, prefix,
                    "_".join([prefix, "IntaRNA_rank.csv"]))
                self._remove_repeat(intarna_file, "IntaRNA")
            overlap_file = os.path.join(self.merge_path, prefix,
                                        "_".join([prefix, "overlap.csv"]))
            merge_file = os.path.join(self.merge_path, prefix,
                                      "_".join([prefix, "merge.csv"]))
            merge_srna_target(
                rnaplex_file, rnaup_file, intarna_file, args_tar,
                out_rnaplex, out_rnaup, out_intarna,
                os.path.join(self.fasta_path, prefix + ".fa"),
                merge_file, overlap_file,
                os.path.join(self.srna_path,
                             "_".join([prefix, "sRNA.gff"])),
                all_gff, target_prefixs)
            if ("RNAplex" in args_tar.program):
                log.write("\t" + out_rnaplex + "\n")
            if ("RNAup" in args_tar.program):
                log.write("\t" + out_rnaup + "\n")
            if ("IntaRNA" in args_tar.program):
                log.write("\t" + out_intarna + "\n")
            if (os.path.exists(merge_file)):
                log.write("\t" + merge_file + "\n")
            if (os.path.exists(overlap_file)):
                log.write("\t" + overlap_file + "\n")

    def _remove_rnaplex(self, line, num, pre_num, pre, checks,
                        out_tmp, print_):
        '''Handle one line of an RNAplex result for ``_remove_repeat``.

        Header lines come in pairs: the odd-numbered one becomes *pre*
        (a key of *checks*), the even-numbered one is stored in
        ``checks[pre]``.  The lines of a record are only written when its
        header pair has not been seen before.  Returns the updated state
        ``(num, pre_num, print_, pre)``.
        '''
        if (line.startswith(">")):
            if (num % 2 == 1):
                print_ = False
                pre = line
                if (line not in checks):
                    checks[line] = []
                    print_ = True
            elif (num % 2 == 0) and (line not in checks[pre]):
                checks[pre].append(line)
                print_ = True
            num = num + 1
        else:
            if (print_):
                if (num != pre_num):
                    # First line of the record: write both headers first.
                    out_tmp.write(pre + "\n")
                    out_tmp.write(checks[pre][-1] + "\n")
                out_tmp.write(line + "\n")
                pre_num = num
        return num, pre_num, print_, pre,

    def _remove_rnaup(self, line, pre, num, pre_num, srna_info,
                      checks, out_tmp, print_, tar):
        '''Handle one line of an RNAup result for ``_remove_repeat``.

        A header following another header is stored in ``checks[<previous
        header>]``; a header following a non-header line is stored in
        ``checks[srna_info]``.  The lines of a record are only written
        when its header was not stored before.  Returns the updated state
        ``(num, pre_num, print_, pre, tar, srna_info)``.
        '''
        if (line.startswith(">")):
            print_ = False
            tar = False
            if (pre.startswith(">")):
                if (pre not in checks):
                    checks[pre] = [line]
                    srna_info = pre
                    print_ = True
                else:
                    # NOTE(review): srna_info is not updated in this
                    # branch - confirm that this is intended.
                    if (line not in checks[pre]):
                        checks[pre].append(line)
                        print_ = True
            else:
                if (num != 1):
                    if (line not in checks[srna_info]):
                        checks[srna_info].append(line)
                        print_ = True
        else:
            if (print_):
                if (pre_num != len(checks)):
                    # A new key was added to checks since the last write.
                    out_tmp.write(srna_info + "\n")
                    out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                else:
                    if (not tar):
                        out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                pre_num = len(checks)
                tar = True
        pre = line
        num = num + 1
        return num, pre_num, print_, pre, tar, srna_info

    def _remove_intarna(self, line, checks, tar, srna_info, seq, out_tmp):
        '''Handle one line of an IntaRNA result for ``_remove_repeat``.

        Lines with at least 8 ``;``-separated fields are keyed by field 0
        (*tar*) and field 3 (*srna_info*) and only written for a new
        combination.  Lines starting with ``.``, ``(`` or ``)`` are
        written when their first field was not yet seen for the current
        combination.  Returns the updated ``(tar, srna_info, seq)``.
        '''
        if (line.startswith(".")) or (
                line.startswith("(")) or (
                line.startswith(")")):
            seq = line.split(";")[0]
            if (seq not in checks[tar][srna_info]):
                checks[tar][srna_info].append(seq)
                out_tmp.write(line + "\n")
        else:
            if (len(line.split(";")) >= 8):
                tar = line.split(";")[0]
                srna_info = line.split(";")[3]
                seq = line.split(";")[7]
                if (tar not in checks):
                    checks[tar] = {}
                    checks[tar][srna_info] = [seq]
                    out_tmp.write(line + "\n")
                else:
                    if (srna_info not in checks[tar]):
                        checks[tar][srna_info] = [seq]
                        out_tmp.write(line + "\n")
        return tar, srna_info, seq

    def _remove_repeat(self, interact_file, type_):
        '''Remove repeated records from *interact_file* in place.

        *type_* selects the parser: "RNAplex", "RNAup" or "IntaRNA".
        '''
        checks = {}
        seq = ""
        pre = ""
        srna_info = ""
        num = 1
        tar = False
        pre_num = 0
        print_ = False
        with open(interact_file + "tmp", "w") as out_tmp, \
                open(interact_file) as fh:
            for line in fh:
                line = line.strip()
                if (type_ == "RNAplex"):
                    num, pre_num, print_, pre = self._remove_rnaplex(
                        line, num, pre_num, pre, checks, out_tmp, print_)
                elif (type_ == "RNAup"):
                    num, pre_num, print_, pre, tar, srna_info = (
                        self._remove_rnaup(
                            line, pre, num, pre_num,
                            srna_info, checks, out_tmp, print_, tar))
                elif (type_ == "IntaRNA"):
                    tar, srna_info, seq = self._remove_intarna(
                        line, checks, tar, srna_info, seq, out_tmp)
        shutil.move(interact_file + "tmp", interact_file)

    def run_srna_target_prediction(self, args_tar, log):
        '''Entry point: prepare the sequences, run every program named in
        ``args_tar.program`` (RNAplex, RNAup, IntaRNA), merge the results
        and remove all temporary files.'''
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        target_prefixs = []
        self._gen_seq(prefixs, target_prefixs, args_tar)
        if ("RNAplex" in args_tar.program):
            self._rna_plex(prefixs, target_prefixs, args_tar, log)
        # The split target files are always generated by _gen_seq, the
        # RNAplfold folder only exists when RNAplex was executed.
        self.helper.remove_all_content(self.target_seq_path,
                                       "_target_", "file")
        tmp_rnaplfold_folder = os.path.join(self.rnaplex_path,
                                            "tmp_RNAplfold")
        if os.path.exists(tmp_rnaplfold_folder):
            shutil.rmtree(tmp_rnaplfold_folder)
        if ("RNAplex" in args_tar.program):
            log.write("The temporary files for running RNAplex are "
                      "deleted.\n")
        if ("RNAup" in args_tar.program):
            self._rnaup(prefixs, target_prefixs, args_tar, log)
        if ("IntaRNA" in args_tar.program):
            self._intarna(prefixs, target_prefixs, args_tar, log)
        self._merge_rnaplex_rnaup(prefixs, target_prefixs, args_tar, log)
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "dir")
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
        # all_target.fa is only created by _intarna; removing it
        # unconditionally fails when IntaRNA was not selected.
        all_target = os.path.join(self.target_seq_path, "all_target.fa")
        if os.path.exists(all_target):
            os.remove(all_target)
class SubLocal(object):
    '''Run psortb on CDSs and summarise the predicted localizations.

    Results are written below ``args_sub.out_folder`` in two sub-folders:
    ``all_CDS`` (every CDS) and ``expressed_CDS`` (only the CDSs that
    overlap a transcript; used when ``args_sub.trans`` is given).
    '''

    def __init__(self, args_sub):
        '''Derive all working paths from ``args_sub`` and create the
        output folders.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express,
                                               "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express,
                                           "psortb_results")
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        '''Create the output, statistics and result folders.'''
        for folder in (self.out_all, self.out_express,
                       self.all_stat_path, self.express_stat_path,
                       self.all_result, self.express_result):
            self.helper.check_make_folder(folder)

    @staticmethod
    def _is_overlap(cds, ta):
        '''Return True if ``cds`` overlaps transcript ``ta``.

        The four alternatives are: the CDS runs into the transcript from
        the left, runs out of it to the right, covers it completely, or
        lies completely inside it.
        '''
        return (((cds.end < ta.end) and (cds.end > ta.start) and
                 (cds.start <= ta.start)) or
                ((cds.start > ta.start) and (cds.start < ta.end) and
                 (cds.end >= ta.end)) or
                ((cds.end >= ta.end) and (cds.start <= ta.start)) or
                ((cds.end <= ta.end) and (cds.start >= ta.start)))

    def _compare_cds_tran(self, gff_file, tran_file):
        '''Write the CDSs of ``gff_file`` that overlap a transcript of
        ``tran_file`` (same strand, same sequence id) to
        ``<out_all>/tmp_cds.gff``.

        Every file is opened in a ``with`` block so that the handles are
        released even if parsing fails.
        '''
        tmp_cds = os.path.join(self.out_all, "tmp_cds.gff")
        with open(tmp_cds, "w") as out, open(gff_file) as fh, \
                open(tran_file) as th:
            cdss = [entry for entry in Gff3Parser().entries(fh)
                    if entry.feature == "CDS"]
            trans = list(Gff3Parser().entries(th))
            for cds in cdss:
                for ta in trans:
                    if (cds.strand == ta.strand) and (
                            cds.seq_id == ta.seq_id) and (
                            self._is_overlap(cds, ta)):
                        out.write(cds.info + "\n")
                        # One overlapping transcript is enough.
                        break

    def _get_protein_seq(self, gff, tmp_path, tran_path):
        '''Generate the DNA and protein fasta files of the CDSs of ``gff``.

        gff: file name of the annotation inside ``self.gff_path``.
        tmp_path: folder which receives ``<prefix>_dna.fa`` and
            ``<prefix>_protein.fa``.
        tran_path: folder of the transcript gff files, or None to use
            all CDSs.
        Returns the prefix (``gff`` without ".gff").
        '''
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generate CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            tmp_cds = os.path.join(self.out_all, "tmp_cds.gff")
            self._compare_cds_tran(
                os.path.join(self.gff_path, gff),
                os.path.join(tran_path,
                             "_".join([prefix, "transcript.gff"])))
            self.helper.get_cds_seq(tmp_cds, fasta, dna_seq_file)
            os.remove(tmp_cds)
        else:
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        print("transfer DNA seq to protein seq of {0}".format(prefix))
        # NOTE: "tmp" is a scratch file in the current working directory.
        self.helper.translation(dna_seq_file, "tmp")
        prot_seq_file = os.path.join(
            tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss("tmp", prot_seq_file)
        os.remove("tmp")
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err):
        '''Call psortb; stdout goes to ``out_raw``, stderr to ``out_err``.'''
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result):
        '''Run psortb on ``<tmp_path>/<prefix>_protein.fa``.

        The raw output is stored in ``<tmp_result>/<prefix>_raw.txt`` and
        stderr in ``<out_folder>/tmp_log``. Exits the program if
        ``args_sub.gram`` is neither "positive" nor "negative"; the
        ``with`` block guarantees both files are closed on that path too.
        '''
        print("Running psortb of {0}".format(prefix))
        err_file = os.path.join(out_folder, "tmp_log")
        raw_file = os.path.join(tmp_result,
                                "_".join([prefix, self.endfix_raw]))
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        with open(err_file, "w") as out_err, open(raw_file, "w") as out_raw:
            if args_sub.gram == "positive":
                self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                             out_raw, out_err)
            elif args_sub.gram == "negative":
                self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                             out_raw, out_err)
            else:
                print("Error:It is not a proper bacteria type - {0}!!".format(
                    args_sub.gram))
                sys.exit()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file):
        '''Convert the raw psortb output of ``prefix`` to a table.

        If ``args_sub.merge`` is set, ``extract_psortb`` additionally
        receives ``gff_file`` and writes ``<prefix>.gff`` (in the current
        working directory), which then replaces ``gff_file``.
        '''
        raw_file = os.path.join(tmp_psortb_path,
                                "_".join([prefix, self.endfix_raw]))
        table_file = os.path.join(tmp_psortb_path,
                                  "_".join([prefix, self.endfix_table]))
        if args_sub.merge:
            print("Merge to gff...")
            extract_psortb(raw_file, table_file, gff_file,
                           os.path.join(prefix + ".gff"), args_sub.fuzzy)
            shutil.move(prefix + ".gff", gff_file)
        else:
            extract_psortb(raw_file, table_file, None, None, args_sub.fuzzy)

    def _merge_and_stat(self, gffs, tmp_psortb_path,
                        stat_path, psortb_result):
        '''For every ``*.gff_folder`` in ``gffs``: copy the raw results,
        merge the per-file tables into one table per folder and run
        ``stat_sublocal`` on the merged table.'''
        for folder in os.listdir(gffs):
            if not folder.endswith(".gff_folder"):
                continue
            prefix = folder.replace(".gff_folder", "")
            result_folder = os.path.join(psortb_result, prefix)
            self.helper.check_make_folder(result_folder)
            merge_table = os.path.join(
                result_folder, "_".join([prefix, self.endfix_table]))
            for gff in os.listdir(os.path.join(gffs, folder)):
                name = gff.replace(".gff", "")
                result = self.helper.get_correct_file(
                    tmp_psortb_path, "_" + self.endfix_raw,
                    name, None, None)
                shutil.copy(result, result_folder)
                result = self.helper.get_correct_file(
                    tmp_psortb_path, "_" + self.endfix_table,
                    name, None, None)
                self.helper.merge_file(result, merge_table)
            self.helper.check_make_folder(os.path.join(stat_path, prefix))
            stat_sublocal(merge_table,
                          os.path.join(stat_path, prefix, prefix),
                          os.path.join(stat_path, prefix, "_".join([
                              "stat", prefix, "sublocal.csv"])))

    def _remove_tmps(self, args_sub):
        '''Delete the temporary folders and the psortb stderr logs.'''
        self.helper.remove_tmp(args_sub.fastas)
        self.helper.remove_tmp(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))

    def run_sub_local(self, args_sub):
        '''Entry point: predict the localization for all CDSs and, if
        transcripts are given, for the expressed CDSs as well.'''
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            gff_file = os.path.join(self.gff_path, gff)
            if args_sub.trans is not None:
                print("Running expressed gene now...")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result)
                self._extract_result(args_sub, self.express_tmp_result,
                                     prefix, gff_file)
            print("Running all gene now...")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result)
            self._extract_result(args_sub, self.all_tmp_result,
                                 prefix, gff_file)
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path, self.express_result)
        self._remove_tmps(args_sub)
class SubLocal(object):
    '''detection of subcellular localization

    Runs psortb on the CDSs of every genome and writes the results below
    ``args_sub.out_folder`` into ``all_CDSs`` (every CDS) and
    ``expressed_CDSs`` (only the CDSs overlapping a transcript; used when
    ``args_sub.trans`` is given). Progress is reported through ``log``,
    which only needs a ``write`` method.
    '''

    def __init__(self, args_sub):
        '''Derive all working paths from ``args_sub`` and create the
        output folders.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        self.out_all = os.path.join(args_sub.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDSs")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express,
                                               "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express,
                                           "psortb_results")
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        '''Create the output, statistics and result folders.'''
        for folder in (self.out_all, self.out_express,
                       self.all_stat_path, self.express_stat_path,
                       self.all_result, self.express_result):
            self.helper.check_make_folder(folder)

    @staticmethod
    def _is_overlap(cds, ta):
        '''Return True if ``cds`` overlaps transcript ``ta``.

        The four alternatives are: the CDS runs into the transcript from
        the left, runs out of it to the right, covers it completely, or
        lies completely inside it.
        '''
        return (((cds.end < ta.end) and (cds.end > ta.start) and
                 (cds.start <= ta.start)) or
                ((cds.start > ta.start) and (cds.start < ta.end) and
                 (cds.end >= ta.end)) or
                ((cds.end >= ta.end) and (cds.start <= ta.start)) or
                ((cds.end <= ta.end) and (cds.start >= ta.start)))

    def _compare_cds_tran(self, gff_file, tran_file, log):
        '''compare CDS and transcript to find the expressed CDS

        The CDSs of ``gff_file`` which overlap a transcript of
        ``tran_file`` (same strand, same sequence id) are written to
        ``<out_all>/tmp_cds.gff``. All files are opened in a ``with``
        block so the handles are released even if parsing fails.
        '''
        log.write("Comparing transcripts and CDSs to get expressed CDSs.\n")
        tmp_cds = os.path.join(self.out_all, "tmp_cds.gff")
        with open(tmp_cds, "w") as out, open(gff_file) as fh, \
                open(tran_file) as th:
            cdss = [entry for entry in Gff3Parser().entries(fh)
                    if entry.feature == "CDS"]
            trans = list(Gff3Parser().entries(th))
            for cds in cdss:
                for ta in trans:
                    if (cds.strand == ta.strand) and (
                            cds.seq_id == ta.seq_id) and (
                            self._is_overlap(cds, ta)):
                        out.write(cds.info + "\n")
                        # One overlapping transcript is enough.
                        break
        log.write("\t" + tmp_cds + " is "
                  "temporary generated.\n")

    def _get_protein_seq(self, gff, tmp_path, tran_path, args_sub, log):
        '''Generate the DNA and protein fasta files of the CDSs of ``gff``.

        gff: file name of the annotation inside ``self.gff_path``.
        tmp_path: folder which receives ``<prefix>_dna.fa`` and
            ``<prefix>_protein.fa``.
        tran_path: folder of the transcript gff files, or None to use
            all CDSs.
        args_sub: only ``out_folder`` is read (location of the scratch
            file used for the translation).
        Returns the prefix (``gff`` without ".gff").
        '''
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            tmp_cds = os.path.join(self.out_all, "tmp_cds.gff")
            log.write("Predicting subcellular localization for expressed "
                      "CDSs for {0}.\n".format(prefix))
            self._compare_cds_tran(
                os.path.join(self.gff_path, gff),
                os.path.join(tran_path,
                             "_".join([prefix, "transcript.gff"])),
                log)
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(tmp_cds, fasta, dna_seq_file)
            os.remove(tmp_cds)
        else:
            log.write("Predicting subcellular localization for all CDSs for "
                      "{0}.\n".format(prefix))
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        log.write("\t" + dna_seq_file + " is generated.\n")
        print("Transfering DNA sequences to protein sequence of {0}".format(
            prefix))
        log.write("Running helper.py to translate DNA sequences to Protein "
                  "sequences.\n")
        tmp_file = os.path.join(args_sub.out_folder, "tmp")
        self.helper.translation(dna_seq_file, tmp_file)
        prot_seq_file = os.path.join(
            tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss(tmp_file, prot_seq_file)
        log.write(prot_seq_file + " is generated.\n")
        os.remove(tmp_file)
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err, log):
        '''Log and run the psortb command; stdout goes to ``out_raw``,
        stderr to ``out_err``.'''
        log.write(" ".join([psortb_path, strain_type, prot_seq_file]) + "\n")
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder,
                    tmp_path, tmp_result, log):
        '''Run psortb on ``<tmp_path>/<prefix>_protein.fa``.

        The raw output is stored in ``<tmp_result>/<prefix>_raw.txt`` and
        stderr in ``<out_folder>/tmp_log``. Exits the program if
        ``args_sub.gram`` is neither "positive" nor "negative"; the
        ``with`` block guarantees both files are closed on that path too.
        '''
        print("Running psortb of {0}".format(prefix))
        log.write("Running Psortb for predict subcellular localization for "
                  "{0}.\n".format(prefix))
        err_file = os.path.join(out_folder, "tmp_log")
        raw_file = os.path.join(tmp_result,
                                "_".join([prefix, self.endfix_raw]))
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        with open(err_file, "w") as out_err, open(raw_file, "w") as out_raw:
            if args_sub.gram == "positive":
                self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                             out_raw, out_err, log)
            elif args_sub.gram == "negative":
                self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                             out_raw, out_err, log)
            else:
                log.write("Please assign \"positive\" or \"negative\" to "
                          "--bacteria_type.\n")
                print("Error: {0} is not a proper bacteria type! "
                      "Please assign positive or negative.".format(
                          args_sub.gram))
                sys.exit()
        log.write("\t" + raw_file + " is temporary generated.\n")

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file,
                        log):
        '''extract the result of psortb

        Converts ``<prefix>_raw.txt`` in ``tmp_psortb_path`` to
        ``<prefix>_table.csv``. ``gff_file`` is accepted for interface
        compatibility but is not used here.
        '''
        log.write("Running extract_psortb.py to extract the information of "
                  "localization.\n")
        raw_file = os.path.join(tmp_psortb_path,
                                "_".join([prefix, self.endfix_raw]))
        table_file = os.path.join(tmp_psortb_path,
                                  "_".join([prefix, self.endfix_table]))
        extract_psortb(raw_file, table_file, None, None, args_sub.fuzzy)
        log.write("\t" + table_file + " is tempoaray generated.\n")

    def _remove_header(self, out_all):
        '''Rewrite the tab-separated table ``out_all`` in place so that it
        starts with exactly one header line.

        Rows whose first field is "#Genome" (the headers repeated by
        merging several tables) are dropped, and blank rows are skipped
        instead of raising IndexError.
        '''
        tmp_table = out_all + "_tmp"
        with open(out_all, "r") as fh, open(tmp_table, "w") as out:
            out.write("\t".join(["#Genome", "Protein", "Strand", "Start",
                                 "End", "Location", "Score"]) + "\n")
            for row in csv.reader(fh, delimiter='\t'):
                if row and row[0] != "#Genome":
                    out.write("\t".join(row) + "\n")
        shutil.move(tmp_table, out_all)

    def _merge_and_stat(self, gffs, tmp_psortb_path,
                        stat_path, psortb_result, log):
        '''For every ``*.gff_folder`` in ``gffs``: copy the raw results,
        merge the per-file tables into one table per folder, and run
        ``stat_sublocal`` on the merged table. The generated files are
        listed in ``log``.'''
        for folder in os.listdir(gffs):
            if not folder.endswith(".gff_folder"):
                continue
            prefix = folder.replace(".gff_folder", "")
            result_folder = os.path.join(psortb_result, prefix)
            self.helper.check_make_folder(result_folder)
            merge_table = os.path.join(
                result_folder, "_".join([prefix, self.endfix_table]))
            for gff in os.listdir(os.path.join(gffs, folder)):
                name = gff.replace(".gff", "")
                result = self.helper.get_correct_file(
                    tmp_psortb_path, "_" + self.endfix_raw,
                    name, None, None)
                shutil.copy(result, result_folder)
                result = self.helper.get_correct_file(
                    tmp_psortb_path, "_" + self.endfix_table,
                    name, None, None)
                self.helper.merge_file(result, merge_table)
            log.write("\t" + merge_table + "\n")
            self._remove_header(merge_table)
            stat_folder = os.path.join(stat_path, prefix)
            self.helper.check_make_folder(stat_folder)
            stat_file = os.path.join(stat_folder, "_".join([
                "stat", prefix, "sublocal.csv"]))
            stat_sublocal(merge_table,
                          os.path.join(stat_folder, prefix), stat_file)
            for file_ in os.listdir(stat_folder):
                log.write("\t" + os.path.join(stat_folder, file_) + "\n")

    def _remove_tmps(self, args_sub):
        '''Delete the temporary folders and the psortb stderr logs.'''
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub, log):
        '''Entry point: predict the localization for all CDSs and, if
        transcripts are given, for the expressed CDSs as well.'''
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            gff_file = os.path.join(self.gff_path, gff)
            if args_sub.trans is not None:
                print("Running expressed genes now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path, args_sub, log)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result, log)
                self._extract_result(args_sub, self.express_tmp_result,
                                     prefix, gff_file, log)
            print("Running all genes now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None,
                                           args_sub, log)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result, log)
            self._extract_result(args_sub, self.all_tmp_result,
                                 prefix, gff_file, log)
        log.write("Running stat_sublocal.py to do statistics, generate "
                  "merged tables, and plot figures.\n")
        log.write("The following files are generated:\n")
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result, log)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path,
                                 self.express_result, log)
        self._remove_tmps(args_sub)
class RATT(object):
    '''annotation transfer

    Wraps a RATT run: the reference genbank files are split and converted
    to embl if necessary, RATT is called for every reference:target pair,
    and the resulting embl files are converted to gff, ptt and rnt.
    Progress is reported through ``log``, which only needs ``write``.
    '''

    def __init__(self, args_ratt):
        '''Derive the working paths from ``args_ratt``.

        ``self.gbk``/``self.gbk_tmp`` are only set when
        ``args_ratt.ref_gbk`` is given; ``self.embl`` points to
        ``args_ratt.ref_embls`` when that is given, otherwise to the
        "embls" folder inside ``ref_gbk``.
        '''
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files, log):
        '''Convert every ``*.gff`` name in ``files`` (located in ``gffs``)
        to a ptt and a rnt file next to it, provided a matching target
        fasta is found.'''
        for gff in files:
            if not gff.endswith(".gff"):
                continue
            gff = os.path.join(gffs, gff)
            prefix = os.path.basename(gff)[:-4]
            rnt = gff[:-3] + "rnt"
            ptt = gff[:-3] + "ptt"
            fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                 ".fa", prefix, None, None)
            if fasta:
                self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                  None, None)
                log.write("\t" + ptt + " is generated.\n")
                log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        '''Replace the per-sequence gff/ptt/rnt files in the output folder
        with the merged ones and delete all temporary folders.
        ``out_gbk`` is accepted for interface compatibility only.'''
        for suffix in (".gff", ".ptt", ".rnt"):
            self.helper.remove_all_content(args_ratt.gff_outfolder,
                                           suffix, "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temperary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        '''Convert the RATT embl file ``ratt_result`` (in
        ``args_ratt.output_path``) to gff, copy it to the gff output
        folder and append its file name to ``files``.'''
        name = ratt_result.split(".")
        # The leading field and the two trailing fields of the RATT file
        # name are dropped; the rest is used as sequence name.
        seq_name = ".".join(name[1:-2])
        filename = seq_name + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        # NOTE: "tmp_gff" is a scratch file in the working directory.
        self.format_fixer.fix_ratt(output_file, seq_name, "tmp_gff")
        shutil.move("tmp_gff", output_file)
        final_gff = os.path.join(args_ratt.gff_outfolder, filename)
        shutil.copy(output_file, final_gff)
        log.write("\t" + final_gff + " is generated.\n")
        files.append(filename)

    @staticmethod
    def _first_value(line, keyword):
        '''Return the first space-separated, non-empty field of ``line``
        that is not ``keyword`` (stripped), or None if there is none.'''
        for data in line.split(" "):
            if (len(data) != 0) and (data != keyword):
                return data.strip()
        return None

    def _parser_embl_gbk(self, files):
        '''Split the genbank ``files`` into one ``<name>.gbk`` per record
        inside ``self.gbk`` and return that folder.

        The record name is taken from the LOCUS line and replaced by the
        VERSION value unless the latter starts with the LOCUS-based file
        name. Lines outside a record (before the first LOCUS or after a
        "//" terminator, e.g. trailing blank lines) are ignored instead
        of being written to a closed or undefined handle.
        '''
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            out = None
            filename = None
            try:
                with open(file_, "r") as f_h:
                    for line in f_h:
                        if line.startswith("LOCUS"):
                            if out is not None:
                                # Previous record had no "//" terminator.
                                out.close()
                            out = open(self.gbk_tmp, "w")
                            locus = self._first_value(line, "LOCUS")
                            if locus is not None:
                                filename = ".".join([locus, "gbk"])
                        elif line.startswith("VERSION"):
                            version = self._first_value(line, "VERSION")
                            if version is not None:
                                new_filename = ".".join([version, "gbk"])
                                if (filename is None) or (
                                        not new_filename.startswith(
                                            filename)):
                                    filename = new_filename
                        if out is None:
                            continue
                        out.write(line)
                        if line.startswith("//"):
                            out.close()
                            out = None
                            if filename is not None:
                                shutil.move(self.gbk_tmp,
                                            os.path.join(self.gbk, filename))
            finally:
                if out is not None:
                    out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl

        Looks for ``.gbk``/``.gbff``/``.gb`` files in ``ref_embls``,
        splits them per record, converts them and moves the ``.embl``
        files to ``self.embl``. Exits the program if no genbank file is
        found. Returns the folder holding the split genbank files.
        '''
        gbks = [os.path.join(ref_embls, embl)
                for embl in os.listdir(ref_embls)
                if embl.endswith((".gbk", ".gbff", ".gb"))]
        if not gbks:
            log.write("--related_gbk_files is assigned, but not gbk files are detected.\n"
                      "The gbk file names need to be ended at .gbk, .gb, or .gbff. \n")
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        out_gbk = self._parser_embl_gbk(gbks)
        log.write("Running converter.py to convert gbk file to embl format.\n")
        self.converter.convert_gbk2embl(out_gbk)
        self.helper.check_make_folder(self.embl)
        self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        log.write("\t" + self.embl + " is generated and the embl files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        '''Call RATT for the target ``tar`` and the reference ``ref``;
        stdout is written to ``out``. Exits the program if the embl
        folder or one of the fasta files is missing.'''
        tar_fasta = os.path.join(self.tmp_files["tar"], tar + ".fa")
        ref_fasta = os.path.join(self.tmp_files["ref"], ref + ".fa")
        if (not os.path.exists(self.embl)) or (
                not os.path.exists(tar_fasta)) or (
                not os.path.exists(ref_fasta)):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write("The strain names in --compare_pair should be the same "
                      "as the strain names in fasta, genbank, or embl files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        command = [args_ratt.ratt_path, self.embl, tar_fasta,
                   args_ratt.element, args_ratt.transfer_type, ref_fasta]
        log.write(" ".join(command) + "\n")
        call(command, stdout=out, stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        '''Run RATT for every "reference:target" pair, move the "final"
        files from the working directory to ``args_ratt.output_path`` and
        delete the other files RATT left in the working directory.'''
        print("Running RATT")
        for pair in args_ratt.pairs:
            names = pair.split(":")
            ref = names[0]
            tar = names[1]
            # The log is rewritten for every pair; the ``with`` block
            # closes it even if _run_ratt exits the program.
            with open(self.ratt_log, "w+") as out:
                self._run_ratt(args_ratt, tar, ref, out, log)
            log.write("The following files are generatd:\n")
            for filename in os.listdir():
                if "final" in filename:
                    log.write("\t" + filename + "\n")
                    shutil.move(filename, os.path.join(args_ratt.output_path,
                                                       filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or (
                        "Reference" in filename) or (
                        "Query" in filename) or (
                        "Sequences" in filename):
                    log.write("\t" + filename + "\n")
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt, log):
        '''Entry point: run RATT and produce one merged gff/ptt/rnt file
        per target genome in ``args_ratt.gff_outfolder``.'''
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            # Same attribute as read in __init__ and _remove_files.
            out_gbk = self._convert_embl(args_ratt.ref_gbk, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write("Running converter.py to convert embl "
                          "files in {0} to gff, ptt, and rnt format.\n".format(data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        # Log the folder rather than the loop variable of the loop above,
        # which is undefined when the output folder is empty.
        log.write("Merging the output of {0}.\n".format(
            args_ratt.output_path))
        for folder in os.listdir(args_ratt.tar_fastas):
            if "_folder" not in folder:
                continue
            datas = folder.split("_folder")
            prefix = ".".join(datas[0].split(".")[:-1])
            files = [file_[:-3] for file_ in os.listdir(
                os.path.join(args_ratt.tar_fastas, folder))]
            for gff in os.listdir(args_ratt.gff_outfolder):
                for file_ in files:
                    for kind in ("gff", "ptt", "rnt"):
                        if (("." + kind) in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files[kind])
            if os.path.exists(self.tmp_files["gff"]):
                for kind in ("gff", "ptt", "rnt"):
                    shutil.move(self.tmp_files[kind], os.path.join(
                        self.tmp_files["out_gff"], prefix + "." + kind))
            else:
                print("Error: Please check your fasta or "
                      "annotation files, they should only contain "
                      "the query genome. And make sure your RATT can "
                      "work properly (check $ANNOgesic/output/"
                      "annotation_transfer/ratt_log.txt).")
                log.write("Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)
class SubLocal(object):
    '''detection of subcellular localization

    Runs psortb on the CDSs of every genome and writes the results below
    ``args_sub.out_folder`` into ``all_CDS`` (every CDS) and
    ``expressed_CDS`` (only the CDSs overlapping a transcript; used when
    ``args_sub.trans`` is given).
    '''

    def __init__(self, args_sub):
        '''Derive all working paths from ``args_sub`` and create the
        output folders.'''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express,
                                               "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express,
                                           "psortb_results")
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        '''Create the output, statistics and result folders.'''
        for folder in (self.out_all, self.out_express,
                       self.all_stat_path, self.express_stat_path,
                       self.all_result, self.express_result):
            self.helper.check_make_folder(folder)

    @staticmethod
    def _is_overlap(cds, ta):
        '''Return True if ``cds`` overlaps transcript ``ta``.

        The four alternatives are: the CDS runs into the transcript from
        the left, runs out of it to the right, covers it completely, or
        lies completely inside it.
        '''
        return (((cds.end < ta.end) and (cds.end > ta.start) and
                 (cds.start <= ta.start)) or
                ((cds.start > ta.start) and (cds.start < ta.end) and
                 (cds.end >= ta.end)) or
                ((cds.end >= ta.end) and (cds.start <= ta.start)) or
                ((cds.end <= ta.end) and (cds.start >= ta.start)))

    def _compare_cds_tran(self, gff_file, tran_file):
        '''compare CDS and transcript to find the expressed CDS

        The CDSs of ``gff_file`` which overlap a transcript of
        ``tran_file`` (same strand, same sequence id) are written to
        ``<out_all>/tmp_cds.gff``. All files are opened in a ``with``
        block so the handles are released even if parsing fails.
        '''
        tmp_cds = os.path.join(self.out_all, "tmp_cds.gff")
        with open(tmp_cds, "w") as out, open(gff_file) as fh, \
                open(tran_file) as th:
            cdss = [entry for entry in Gff3Parser().entries(fh)
                    if entry.feature == "CDS"]
            trans = list(Gff3Parser().entries(th))
            for cds in cdss:
                for ta in trans:
                    if (cds.strand == ta.strand) and (
                            cds.seq_id == ta.seq_id) and (
                            self._is_overlap(cds, ta)):
                        out.write(cds.info + "\n")
                        # One overlapping transcript is enough.
                        break

    def _get_protein_seq(self, gff, tmp_path, tran_path):
        '''Generate the DNA and protein fasta files of the CDSs of ``gff``.

        gff: file name of the annotation inside ``self.gff_path``.
        tmp_path: folder which receives ``<prefix>_dna.fa`` and
            ``<prefix>_protein.fa``.
        tran_path: folder of the transcript gff files, or None to use
            all CDSs.
        Returns the prefix (``gff`` without ".gff").
        '''
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            tmp_cds = os.path.join(self.out_all, "tmp_cds.gff")
            self._compare_cds_tran(
                os.path.join(self.gff_path, gff),
                os.path.join(tran_path,
                             "_".join([prefix, "transcript.gff"])))
            self.helper.get_cds_seq(tmp_cds, fasta, dna_seq_file)
            os.remove(tmp_cds)
        else:
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        print("Transfering DNA seq to protein seq of {0}".format(prefix))
        # NOTE: "tmp" is a scratch file in the current working directory.
        self.helper.translation(dna_seq_file, "tmp")
        prot_seq_file = os.path.join(
            tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss("tmp", prot_seq_file)
        os.remove("tmp")
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err):
        '''Call psortb; stdout goes to ``out_raw``, stderr to ``out_err``.'''
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result):
        '''Run psortb on ``<tmp_path>/<prefix>_protein.fa``.

        The raw output is stored in ``<tmp_result>/<prefix>_raw.txt`` and
        stderr in ``<out_folder>/tmp_log``. Exits the program if
        ``args_sub.gram`` is neither "positive" nor "negative"; the
        ``with`` block guarantees both files are closed on that path too.
        '''
        print("Running psortb of {0}".format(prefix))
        err_file = os.path.join(out_folder, "tmp_log")
        raw_file = os.path.join(tmp_result,
                                "_".join([prefix, self.endfix_raw]))
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        with open(err_file, "w") as out_err, open(raw_file, "w") as out_raw:
            if args_sub.gram == "positive":
                self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                             out_raw, out_err)
            elif args_sub.gram == "negative":
                self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                             out_raw, out_err)
            else:
                print("Error: It is not a proper bacteria type - {0}!!".format(
                    args_sub.gram))
                sys.exit()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file):
        '''extract the result of psortb

        Converts ``<prefix>_raw.txt`` in ``tmp_psortb_path`` to
        ``<prefix>_table.csv``. If ``args_sub.merge`` is set,
        ``extract_psortb`` additionally receives ``gff_file`` and writes
        ``<prefix>.gff`` (in the current working directory), which then
        replaces ``gff_file``.
        '''
        raw_file = os.path.join(tmp_psortb_path,
                                "_".join([prefix, self.endfix_raw]))
        table_file = os.path.join(tmp_psortb_path,
                                  "_".join([prefix, self.endfix_table]))
        if args_sub.merge:
            print("Merging gff")
            extract_psortb(raw_file, table_file, gff_file,
                           os.path.join(prefix + ".gff"), args_sub.fuzzy)
            shutil.move(prefix + ".gff", gff_file)
        else:
            extract_psortb(raw_file, table_file, None, None, args_sub.fuzzy)

    def _merge_and_stat(self, gffs, tmp_psortb_path,
                        stat_path, psortb_result):
        '''For every ``*.gff_folder`` in ``gffs``: copy the raw results,
        merge the per-file tables into one table per folder and run
        ``stat_sublocal`` on the merged table.'''
        for folder in os.listdir(gffs):
            if not folder.endswith(".gff_folder"):
                continue
            prefix = folder.replace(".gff_folder", "")
            result_folder = os.path.join(psortb_result, prefix)
            self.helper.check_make_folder(result_folder)
            merge_table = os.path.join(
                result_folder, "_".join([prefix, self.endfix_table]))
            for gff in os.listdir(os.path.join(gffs, folder)):
                name = gff.replace(".gff", "")
                result = self.helper.get_correct_file(
                    tmp_psortb_path, "_" + self.endfix_raw,
                    name, None, None)
                shutil.copy(result, result_folder)
                result = self.helper.get_correct_file(
                    tmp_psortb_path, "_" + self.endfix_table,
                    name, None, None)
                self.helper.merge_file(result, merge_table)
            self.helper.check_make_folder(os.path.join(stat_path, prefix))
            stat_sublocal(merge_table,
                          os.path.join(stat_path, prefix, prefix),
                          os.path.join(stat_path, prefix, "_".join([
                              "stat", prefix, "sublocal.csv"])))

    def _remove_tmps(self, args_sub):
        '''Delete the temporary folders and the psortb stderr logs.'''
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub):
        '''Entry point: predict the localization for all CDSs and, if
        transcripts are given, for the expressed CDSs as well.'''
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                    args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            gff_file = os.path.join(self.gff_path, gff)
            if args_sub.trans is not None:
                print("Running expressed gene now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result)
                self._extract_result(args_sub, self.express_tmp_result,
                                     prefix, gff_file)
            print("Running all gene now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result)
            self._extract_result(args_sub, self.all_tmp_result,
                                 prefix, gff_file)
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path, self.express_result)
        self._remove_tmps(args_sub)
class SubLocal(object):
    '''detection of subcellular localization'''

    def __init__(self, args_sub):
        '''Create the helper objects and derive all working paths.

        args_sub must provide gffs, fastas, trans (may be None) and
        out_folder. The result folders are created immediately through
        _make_folder().
        '''
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        self.out_all = os.path.join(args_sub.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDSs")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express,
                                               "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express,
                                           "psortb_results")
        # suffixes shared by every per-genome table / raw psortb output
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        '''ensure the output, statistics and result folders exist'''
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file, log):
        '''compare CDS and transcript to find the expressed CDS

        Every CDS of gff_file that lies on the same strand and sequence
        as a transcript of tran_file and satisfies the positional test
        below is written (once) to <out_all>/tmp_cds.gff.
        '''
        log.write("Comparing transcripts and CDSs to get expressed CDSs.\n")
        tmp_cds = os.path.join(self.out_all, "tmp_cds.gff")
        # Context managers guarantee the handles are released even when
        # parsing fails half way through.
        with open(tmp_cds, "w") as out, open(gff_file) as fh, \
                open(tran_file) as th:
            cdss = [entry for entry in Gff3Parser().entries(fh)
                    if entry.feature == "CDS"]
            trans = list(Gff3Parser().entries(th))
            for cds in cdss:
                for ta in trans:
                    if (cds.strand == ta.strand) and (
                            cds.seq_id == ta.seq_id):
                        if ((cds.end < ta.end) and (
                                cds.end > ta.start) and (
                                cds.start <= ta.start)) or (
                                (cds.start > ta.start) and (
                                    cds.start < ta.end) and (
                                    cds.end >= ta.end)) or (
                                (cds.end >= ta.end) and (
                                    cds.start <= ta.start)) or (
                                (cds.end <= ta.end) and (
                                    cds.start >= ta.start)):
                            out.write(cds.info + "\n")
                            # one matching transcript is enough
                            break
        log.write("\t" + tmp_cds + " is "
                  "temporary generated.\n")

    def _get_protein_seq(self, gff, tmp_path, tran_path, args_sub, log):
        '''Write the CDS DNA and protein fasta files of one gff file.

        gff is a file name inside self.gff_path. When tran_path is not
        None only the CDSs selected by _compare_cds_tran are used.
        Files are written to tmp_path as <prefix>_dna.fa and
        <prefix>_protein.fa. Returns the prefix (gff without ".gff").
        '''
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            log.write("Predicting subcellular localization for expressed "
                      "CDSs for {0}.\n".format(prefix))
            self._compare_cds_tran(
                os.path.join(self.gff_path, gff),
                os.path.join(tran_path,
                             "_".join([prefix, "transcript.gff"])),
                log)
            log.write("Running helper.py to extract sequences for CDSs.\n")
            tmp_cds = os.path.join(self.out_all, "tmp_cds.gff")
            self.helper.get_cds_seq(tmp_cds, fasta, dna_seq_file)
            os.remove(tmp_cds)
        else:
            log.write("Predicting subcellular localization for all CDSs for "
                      "{0}.\n".format(prefix))
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        log.write("\t" + dna_seq_file + " is generated.\n")
        print("Transfering DNA sequences to protein sequence of {0}".format(
            prefix))
        log.write("Running helper.py to translate DNA sequences to Protein "
                  "sequences.\n")
        # the translation goes through a scratch file that fix_emboss
        # rewrites into the final protein fasta
        tmp_file = os.path.join(args_sub.out_folder, "tmp")
        self.helper.translation(dna_seq_file, tmp_file)
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss(tmp_file, prot_seq_file)
        log.write(prot_seq_file + " is generated.\n")
        os.remove(tmp_file)
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err, log):
        '''log and run one psortb command; stdout/stderr go to the handles'''
        log.write(" ".join([psortb_path, strain_type, prot_seq_file]) + "\n")
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result,
                    log):
        '''Run psortb on <tmp_path>/<prefix>_protein.fa.

        The raw output is stored as <tmp_result>/<prefix>_raw.txt and
        stderr as <out_folder>/tmp_log. Exits the program when
        args_sub.gram is neither "positive" nor "negative".
        '''
        print("Running psortb of {0}".format(prefix))
        log.write("Running Psortb for predict subcellular localization for "
                  "{0}.\n".format(prefix))
        raw_file = os.path.join(tmp_result,
                                "_".join([prefix, self.endfix_raw]))
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        # Validate the bacteria type before touching any file so that an
        # invalid value neither leaks handles nor leaves empty outputs.
        if args_sub.gram == "positive":
            strain_type = "-p"
        elif args_sub.gram == "negative":
            strain_type = "-n"
        else:
            log.write("Please assign \"positive\" or \"negative\" to "
                      "--bacteria_type.\n")
            print("Error: {0} is not a proper bacteria type! "
                  "Please assign positive or negative.".format(
                      args_sub.gram))
            sys.exit()
        with open(os.path.join(out_folder, "tmp_log"), "w") as out_err, \
                open(raw_file, "w") as out_raw:
            self._psortb(args_sub.psortb_path, strain_type, prot_seq_file,
                         out_raw, out_err, log)
        log.write("\t" + raw_file + " is temporary generated.\n")

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file,
                        log):
        '''extract the result of psortb

        Converts <prefix>_raw.txt in tmp_psortb_path into
        <prefix>_table.csv. gff_file is accepted for interface
        compatibility but is not used here.
        '''
        log.write("Running extract_psortb.py to extract the information of "
                  "localization.\n")
        table = os.path.join(tmp_psortb_path,
                             "_".join([prefix, self.endfix_table]))
        extract_psortb(
            os.path.join(tmp_psortb_path,
                         "_".join([prefix, self.endfix_raw])),
            table, None, None, args_sub.fuzzy)
        log.write("\t" + table + " is tempoaray generated.\n")

    def _remove_header(self, out_all):
        '''Rewrite the merged table out_all with a single header line.

        All rows whose first field is "#Genome" are dropped and one
        header is written at the top. Empty rows (blank lines) are
        skipped instead of raising IndexError.
        '''
        tmp_table = out_all + "_tmp"
        with open(tmp_table, "w") as out, open(out_all, "r") as fh:
            out.write("\t".join([
                "#Genome", "Protein", "Strand", "Start", "End",
                "Location", "Score"]) + "\n")
            for row in csv.reader(fh, delimiter='\t'):
                # csv.reader yields [] for a blank line
                if row and row[0] != "#Genome":
                    out.write("\t".join(row) + "\n")
        shutil.move(tmp_table, out_all)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path,
                        psortb_result, log):
        '''Merge the per-file tables of each "*.gff_folder" and run stats.

        For every folder in gffs ending with ".gff_folder" the raw
        psortb files are copied to <psortb_result>/<prefix>, the tables
        are merged into <prefix>_table.csv there, and stat_sublocal is
        run with outputs below <stat_path>/<prefix>.
        '''
        for folder in os.listdir(gffs):
            if not folder.endswith(".gff_folder"):
                continue
            prefix = folder.replace(".gff_folder", "")
            self.helper.check_make_folder(
                os.path.join(psortb_result, prefix))
            merge_table = os.path.join(
                psortb_result, prefix,
                "_".join([prefix, self.endfix_table]))
            for gff in os.listdir(os.path.join(gffs, folder)):
                result = self.helper.get_correct_file(
                    tmp_psortb_path, "_" + self.endfix_raw,
                    gff.replace(".gff", ""), None, None)
                shutil.copy(result, os.path.join(psortb_result, prefix))
                result = self.helper.get_correct_file(
                    tmp_psortb_path, "_" + self.endfix_table,
                    gff.replace(".gff", ""), None, None)
                self.helper.merge_file(result, merge_table)
            log.write("\t" + merge_table + "\n")
            # merging concatenates one header per input table
            self._remove_header(merge_table)
            self.helper.check_make_folder(os.path.join(stat_path, prefix))
            stat_folder = os.path.join(stat_path, prefix)
            stat_file = os.path.join(
                stat_folder, "_".join(["stat", prefix, "sublocal.csv"]))
            stat_sublocal(merge_table, os.path.join(stat_folder, prefix),
                          stat_file)
            for file_ in os.listdir(stat_folder):
                log.write("\t" + os.path.join(stat_folder, file_) + "\n")

    def _remove_tmps(self, args_sub):
        '''delete the temporary folders and psortb stderr logs'''
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub, log):
        '''Entry point: predict localization for all (and expressed) CDSs.

        The expressed-CDS branch only runs when args_sub.trans is not
        None. Progress is written to log; temporary data are removed
        at the end.
        '''
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            gff_file = os.path.join(self.gff_path, gff)
            if args_sub.trans is not None:
                print("Running expressed genes now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path, args_sub, log)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result, log)
                self._extract_result(args_sub, self.express_tmp_result,
                                     prefix, gff_file, log)
            print("Running all genes now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None,
                                           args_sub, log)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result, log)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 gff_file, log)
        log.write("Running stat_sublocal.py to do statistics, generate "
                  "merged tables, and plot figures.\n")
        log.write("The following files are generated:\n")
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result, log)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path,
                                 self.express_result, log)
        self._remove_tmps(args_sub)
class RATT(object):
    '''annotation transfer'''

    def __init__(self, args_ratt):
        '''Create the helper objects and derive the working paths.

        When args_ratt.ref_gbk is set, the Genbank scratch folder and
        the embl folder live below it; args_ratt.ref_embls, when set,
        overrides the embl folder.
        '''
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files, log):
        '''Create .ptt/.rnt next to every ".gff" name of files in gffs.

        A file is only converted when a matching target fasta is found.
        '''
        for gff in files:
            if not gff.endswith(".gff"):
                continue
            gff = os.path.join(gffs, gff)
            filename = gff.split("/")
            prefix = filename[-1][:-4]
            rnt = gff[:-3] + "rnt"
            ptt = gff[:-3] + "ptt"
            fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                 ".fa", prefix, None, None)
            if fasta:
                self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                  None, None)
                log.write("\t" + ptt + " is generated.\n")
                log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        '''Publish the merged outputs and delete the temporary data.

        out_gbk is accepted for interface compatibility but not used.
        '''
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".rnt", "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temperary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        '''Convert one RATT embl result to gff and register it in files.

        The gff name is built from the dot-separated parts of
        ratt_result without the first and the last two parts. The file
        is written to output_path and copied to gff_outfolder; its name
        is appended to files.
        '''
        name = ratt_result.split(".")
        strain = ".".join(name[1:-2])
        filename = strain + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        # fix_ratt writes to a scratch file that then replaces the gff
        self.format_fixer.fix_ratt(output_file, strain, "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file,
                    os.path.join(args_ratt.gff_outfolder, filename))
        log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) +
                  " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        '''Split Genbank files into one .gbk file per record.

        Each record (from a LOCUS line to the closing "//") is written
        to self.gbk, named after the first token following LOCUS or
        VERSION plus ".gbk". Lines outside a record are ignored and a
        record without a closing "//" is not moved into self.gbk.
        Returns self.gbk.
        '''
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            # None while no record is being written; prevents writing
            # to a handle that was already closed at "//".
            out = None
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        if out is not None:
                            # previous record had no closing "//"
                            out.close()
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data.strip(),
                                                         "gbk"])
                                break
                        # str.find is 0 (false) only when the LOCUS name
                        # is a prefix of the VERSION name, so the VERSION
                        # name wins in every other case.
                        # NOTE(review): probably meant as a containment
                        # test - confirm before changing.
                        if new_filename.find(filename):
                            filename = new_filename
                    if out is not None:
                        out.write(line)
                        if line.startswith("//"):
                            out.close()
                            out = None
                            shutil.move(self.gbk_tmp,
                                        os.path.join(self.gbk, filename))
            if out is not None:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl

        ref_embls is the folder searched for .gbk/.gbff/.gb files. The
        program exits when none is found. Returns the folder holding
        the split Genbank records.
        '''
        gbks = []
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                gbks.append(os.path.join(ref_embls, embl))
        if not gbks:
            log.write(
                "--related_gbk_files is assigned, but not gbk files are "
                "detected.\n"
                "The gbk file names need to be ended at .gbk, .gb, or "
                ".gbff. \n")
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        out_gbk = self._parser_embl_gbk(gbks)
        log.write(
            "Running converter.py to convert gbk file to embl format.\n")
        self.converter.convert_gbk2embl(out_gbk)
        self.helper.check_make_folder(self.embl)
        self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        log.write("\t" + self.embl +
                  " is generated and the embl files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        '''Run RATT for one reference/target pair; stdout goes to out.

        Exits the program when the embl folder or one of the two fasta
        files does not exist.
        '''
        tar_fasta = os.path.join(self.tmp_files["tar"], tar + ".fa")
        ref_fasta = os.path.join(self.tmp_files["ref"], ref + ".fa")
        if (not os.path.exists(self.embl)) or (
                not os.path.exists(tar_fasta)) or (
                not os.path.exists(ref_fasta)):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write(
                "The strain names in --compare_pair should be the same "
                "as the strain names in fasta, genbank, or embl files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        command = [args_ratt.ratt_path, self.embl, tar_fasta,
                   args_ratt.element, args_ratt.transfer_type, ref_fasta]
        log.write(" ".join(command) + "\n")
        call(command, stdout=out, stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        '''Run RATT for every "ref:tar" pair and tidy the working dir.

        Files in the current directory whose name contains "final" are
        moved to output_path; other RATT by-products are deleted. The
        RATT log is truncated for every pair, as before, but each
        handle is now closed right after its run.
        '''
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            with open(self.ratt_log, "w+") as out:
                self._run_ratt(args_ratt, tar, ref, out, log)
            log.write("The following files are generatd:\n")
            for filename in os.listdir():
                if ("final" in filename):
                    log.write("\t" + filename + "\n")
                    shutil.move(filename,
                                os.path.join(args_ratt.output_path,
                                             filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or (
                        "Reference" in filename) or (
                        "Query" in filename) or (
                        "Sequences" in filename):
                    log.write("\t" + filename + "\n")
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt, log):
        '''Entry point: run RATT and merge the results per target genome.

        Genbank references are converted to embl first when
        args_ratt.ref_embls is None. The "final.embl" results are
        converted to gff/ptt/rnt and merged per "*_folder" of
        tar_fastas into gff_outfolder.
        '''
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            # was "ref_gbki", an attribute that is never set
            out_gbk = self._convert_embl(args_ratt.ref_gbk, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write(
                    "Running converter.py to convert embl "
                    "files in {0} to gff, ptt, and rnt format.\n".format(
                        data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        # Name the folder instead of the leaked loop variable, which was
        # an arbitrary entry and undefined for an empty folder.
        log.write("Merging the output of {0}.\n".format(
            args_ratt.output_path))
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" not in folder:
                continue
            datas = folder.split("_folder")
            prefix = ".".join(datas[0].split(".")[:-1])
            for file_ in os.listdir(
                    os.path.join(args_ratt.tar_fastas, folder)):
                # strip the ".fa" extension
                files.append(file_[:-3])
            for gff in os.listdir(args_ratt.gff_outfolder):
                for file_ in files:
                    if (".gff" in gff) and (file_ == gff[:-4]):
                        self.helper.merge_file(
                            os.path.join(args_ratt.gff_outfolder, gff),
                            self.tmp_files["gff"])
                    if (".ptt" in gff) and (file_ == gff[:-4]):
                        self.helper.merge_file(
                            os.path.join(args_ratt.gff_outfolder, gff),
                            self.tmp_files["ptt"])
                    if (".rnt" in gff) and (file_ == gff[:-4]):
                        self.helper.merge_file(
                            os.path.join(args_ratt.gff_outfolder, gff),
                            self.tmp_files["rnt"])
            if os.path.exists(self.tmp_files["gff"]):
                shutil.move(
                    self.tmp_files["gff"],
                    os.path.join(self.tmp_files["out_gff"],
                                 prefix + ".gff"))
                shutil.move(
                    self.tmp_files["ptt"],
                    os.path.join(self.tmp_files["out_gff"],
                                 prefix + ".ptt"))
                shutil.move(
                    self.tmp_files["rnt"],
                    os.path.join(self.tmp_files["out_gff"],
                                 prefix + ".rnt"))
            else:
                print("Error: Please check your fasta or "
                      "annotation files, they should only contain "
                      "the query genome. And make sure your RATT can "
                      "work properly (check $ANNOgesic/output/"
                      "annotation_transfer/ratt_log.txt).")
                log.write("Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)
class RATT(object):
    '''annotation transfer'''

    def __init__(self, args_ratt):
        '''Create the helper objects and derive the working paths.

        When args_ratt.ref_gbk is set, the Genbank scratch folder and
        the embl folder live below it; args_ratt.ref_embls, when set,
        overrides the embl folder.
        '''
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files):
        '''Create .ptt/.rnt next to every ".gff" name of files in gffs.

        A file is only converted when a matching target fasta is found.
        '''
        for gff in files:
            if not gff.endswith(".gff"):
                continue
            gff = os.path.join(gffs, gff)
            filename = gff.split("/")
            prefix = filename[-1][:-4]
            rnt = gff[:-3] + "rnt"
            ptt = gff[:-3] + "ptt"
            fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                 ".fa", prefix, None, None)
            if fasta:
                self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                  None, None)

    def _remove_files(self, args_ratt, out_gbk):
        '''Publish the merged outputs and delete the temporary data.

        out_gbk is accepted for interface compatibility but not used.
        '''
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder,
                                       ".rnt", "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        '''Convert one RATT embl result to gff and register it in files.

        The gff name is built from the dot-separated parts of
        ratt_result without the first and the last two parts. The file
        is written to output_path and copied to gff_outfolder; its name
        is appended to files.
        '''
        name = ratt_result.split(".")
        strain = ".".join(name[1:-2])
        filename = strain + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        # fix_ratt writes to a scratch file that then replaces the gff
        self.format_fixer.fix_ratt(output_file, strain, "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file,
                    os.path.join(args_ratt.gff_outfolder, filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        '''Split Genbank files into one .gbk file per record.

        Each record (from a LOCUS line to the closing "//") is written
        to self.gbk, named after the first token following LOCUS or
        VERSION plus ".gbk". Tokens are stripped so a name taken from
        the end of a line carries no newline. Lines outside a record
        are ignored and a record without a closing "//" is not moved
        into self.gbk. Returns self.gbk.
        '''
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            # None while no record is being written; prevents writing
            # to a handle that was already closed at "//".
            out = None
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        if out is not None:
                            # previous record had no closing "//"
                            out.close()
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data.strip(),
                                                         "gbk"])
                                break
                        # str.find is 0 (false) only when the LOCUS name
                        # is a prefix of the VERSION name, so the VERSION
                        # name wins in every other case.
                        # NOTE(review): probably meant as a containment
                        # test - confirm before changing.
                        if new_filename.find(filename):
                            filename = new_filename
                    if out is not None:
                        out.write(line)
                        if line.startswith("//"):
                            out.close()
                            out = None
                            shutil.move(self.gbk_tmp,
                                        os.path.join(self.gbk, filename))
            if out is not None:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        '''convert gbk to embl

        ref_embls is the folder searched for .gbk/.gbff/.gb files. The
        program exits when none is found. Returns the folder holding
        the split Genbank records.
        '''
        gbks = []
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                gbks.append(os.path.join(ref_embls, embl))
        if not gbks:
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        out_gbk = self._parser_embl_gbk(gbks)
        self.converter.convert_gbk2embl(out_gbk)
        self.helper.check_make_folder(self.embl)
        self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        '''run RATT for one reference/target pair; stdout goes to out'''
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        '''Run RATT for every "ref:tar" pair and tidy the working dir.

        Files in the current directory whose name contains "final" are
        moved to output_path; other RATT by-products are deleted. The
        RATT log is truncated for every pair, as before, but each
        handle is now closed right after its run.
        '''
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            with open(self.ratt_log, "w+") as out:
                self._run_ratt(args_ratt, tar, ref, out)
            for filename in os.listdir():
                if ("final" in filename):
                    shutil.move(filename,
                                os.path.join(args_ratt.output_path,
                                             filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or (
                        "Reference" in filename) or (
                        "Query" in filename) or (
                        "Sequences" in filename):
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)

    def annotation_transfer(self, args_ratt):
        '''Entry point: run RATT and optionally merge the results.

        Genbank references are converted to embl first when
        args_ratt.ref_embls is None. When args_ratt.convert is true
        the "final.embl" results are converted to gff/ptt/rnt and
        merged per "*_folder" of tar_fastas.
        '''
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            out_gbk = self._convert_embl(args_ratt.ref_gbk)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" not in folder:
                    continue
                datas = folder.split("_folder")
                prefix = ".".join(datas[0].split(".")[:-1])
                for file_ in os.listdir(
                        os.path.join(args_ratt.tar_fastas, folder)):
                    # strip the ".fa" extension
                    files.append(file_[:-3])
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["rnt"])
                if os.path.exists(self.tmp_files["gff"]):
                    shutil.move(
                        self.tmp_files["gff"],
                        os.path.join(self.tmp_files["out_gff"],
                                     prefix + ".gff"))
                    shutil.move(
                        self.tmp_files["ptt"],
                        os.path.join(self.tmp_files["out_gff"],
                                     prefix + ".ptt"))
                    shutil.move(
                        self.tmp_files["rnt"],
                        os.path.join(self.tmp_files["out_gff"],
                                     prefix + ".rnt"))
                else:
                    print("Error: Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).")
        # NOTE(review): the source had lost its indentation; the cleanup
        # is assumed to run regardless of args_ratt.convert - confirm.
        self._remove_files(args_ratt, out_gbk)
class sRNATargetPrediction(object): def __init__(self, args_tar): self.multiparser = Multiparser() self.helper = Helper() self.fixer = FormatFixer() self.gff_parser = Gff3Parser() self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs") self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs") self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex") self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup") self.merge_path = os.path.join(args_tar.out_folder, "merge") self.srna_path = os.path.join(args_tar.srnas, "tmp") self.fasta_path = os.path.join(args_tar.fastas, "tmp") self.gff_path = os.path.join(args_tar.gffs, "tmp") self.tmps = {"tmp": "tmp", "rnaup": "tmp_rnaup", "log": "tmp_log", "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"} def _check_gff(self, gffs): for gff in os.listdir(gffs): if gff.endswith(".gff"): self.helper.check_uni_attributes(os.path.join(gffs, gff)) def _run_rnaplfold(self, vienna_path, file_type, win_size, span, unstr_region, seq_path, prefix, out_path): current = os.getcwd() os.chdir(out_path) command = " ".join([os.path.join(vienna_path, "RNAplfold"), "-W", str(win_size), "-L", str(span), "-u", str(unstr_region), "-O"]) if file_type == "sRNA": os.system("<".join([command, os.path.join(current, seq_path, "_".join([self.tmps["tmp"], prefix, file_type + ".fa"]))])) else: os.system("<".join([command, os.path.join(current, seq_path, "_".join([prefix, file_type + ".fa"]))])) os.chdir(current) def _wait_process(self, processes): for p in processes: p.wait() if p.stdout: p.stdout.close() if p.stdin: p.stdin.close() if p.stderr: p.stderr.close() try: p.kill() except OSError: pass time.sleep(5) def _sort_srna_fasta(self, fasta, prefix, path): out = open(os.path.join(path, "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w") srnas = [] with open(fasta) as f_h: for line in f_h: line = line.strip() if line.startswith(">"): name = line[1:] else: srnas.append({"name": name, "seq": line, "len": len(line)}) srnas = 
sorted(srnas, key=lambda x: (x["len"])) for srna in srnas: out.write(">" + srna["name"].split("|")[0] + "\n") out.write(srna["seq"] + "\n") out.close() def _read_fasta(self, fasta_file): seq = "" with open(fasta_file, "r") as seq_f: for line in seq_f: line = line.strip() if line.startswith(">"): continue else: seq = seq + line return seq def _get_specific_seq(self, srna_file, seq_file, srna_out, querys): for query in querys: srna_datas = query.split(":") srna = {"seq_id": srna_datas[0], "strand": srna_datas[1], "start": int(srna_datas[2]), "end": int(srna_datas[3])} gff_f = open(srna_file, "r") out = open(srna_out, "a") seq = self._read_fasta(seq_file) num = 0 for entry in self.gff_parser.entries(gff_f): if (entry.seq_id == srna["seq_id"]) and ( entry.strand == srna["strand"]) and ( entry.start == srna["start"]) and ( entry.end == srna["end"]): if "ID" in entry.attributes.keys(): id_ = entry.attributes["ID"] else: id_ = entry.feature + str(num) gene = self.helper.extract_gene(seq, entry.start, entry.end, entry.strand) out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format( id_, entry.seq_id, entry.start, entry.end, entry.strand, gene)) num += 1 gff_f.close() out.close() def _gen_seq(self, prefixs, args_tar): print("Generating sRNA fasta files...") for srna in os.listdir(self.srna_path): if srna.endswith("_sRNA.gff"): prefix = srna.replace("_sRNA.gff", "") prefixs.append(prefix) srna_out = os.path.join(self.srna_seq_path, "_".join([prefix, "sRNA.fa"])) if "all" in args_tar.query: self.helper.get_seq( os.path.join(self.srna_path, srna), os.path.join(self.fasta_path, prefix + ".fa"), srna_out) else: if "_".join([prefix, "sRNA.fa"]) in os.listdir( self.srna_seq_path): os.remove(srna_out) self._get_specific_seq( os.path.join(self.srna_path, srna), os.path.join(self.fasta_path, prefix + ".fa"), srna_out, args_tar.query) self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path) print("Generating target fasta files...") for gff in os.listdir(self.gff_path): if 
gff.endswith(".gff"): prefix = gff.replace(".gff", "") potential_target(os.path.join(self.gff_path, gff), os.path.join(self.fasta_path, prefix + ".fa"), os.path.join(self.target_seq_path), args_tar) file_num = 1 num = 0 sub_prefix = os.path.join(self.target_seq_path, "_".join([prefix, "target"])) sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]), "w") with open((sub_prefix + ".fa"), "r") as t_f: for line in t_f: line = line.strip() if line.startswith(">"): num += 1 if (num == 100): num = 0 file_num += 1 sub_out.close() sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]), "w") sub_out.write(line + "\n") sub_out.close() def _run_rnaplex(self, prefix, rnaplfold_path, args_tar): print("Running RNAplex of {0}".format(prefix)) num_process = 0 processes = [] for seq in os.listdir(self.target_seq_path): if (prefix in seq) and ("_target_" in seq): print("Running RNAplex with {0}".format(seq)) out_rnaplex = open(os.path.join( self.rnaplex_path, prefix, "_".join([ prefix, "RNAplex", str(num_process) + ".txt"])), "w") num_process += 1 p = Popen([os.path.join(args_tar.vienna_path, "RNAplex"), "-q", os.path.join( self.srna_seq_path, "_".join([ self.tmps["tmp"], prefix, "sRNA.fa"])), "-t", os.path.join(self.target_seq_path, seq), "-l", str(args_tar.inter_length), "-e", str(args_tar.energy), "-z", str(args_tar.duplex_dist), "-a", rnaplfold_path], stdout=out_rnaplex) processes.append(p) if num_process % args_tar.core_plex == 0: self._wait_process(processes) self._wait_process(processes) return num_process def _rna_plex(self, prefixs, args_tar): for prefix in prefixs: print("Running RNAplfold of {0}".format(prefix)) self.helper.check_make_folder( os.path.join(self.rnaplex_path, prefix)) rnaplfold_path = os.path.join(self.rnaplex_path, prefix, "RNAplfold") os.mkdir(rnaplfold_path) self._run_rnaplfold( args_tar.vienna_path, "sRNA", args_tar.win_size_s, args_tar.span_s, args_tar.unstr_region_rnaplex_s, self.srna_seq_path, prefix, rnaplfold_path) self._run_rnaplfold( 
args_tar.vienna_path, "target", args_tar.win_size_t, args_tar.span_t, args_tar.unstr_region_rnaplex_t, self.target_seq_path, prefix, rnaplfold_path) num_process = self._run_rnaplex(prefix, rnaplfold_path, args_tar) rnaplex_file = os.path.join(self.rnaplex_path, prefix, "_".join([prefix, "RNAplex.txt"])) if ("_".join([prefix, "RNAplex.txt"]) in os.listdir(os.path.join(self.rnaplex_path, prefix))): os.remove(rnaplex_file) for index in range(0, num_process): self.helper.merge_file(os.path.join( self.rnaplex_path, prefix, "_".join([ prefix, "RNAplex", str(index) + ".txt"])), rnaplex_file) self.helper.remove_all_content(os.path.join( self.rnaplex_path, prefix), "_RNAplex_", "file") self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"]) shutil.move(self.tmps["tmp"], rnaplex_file) def _run_rnaup(self, num_up, processes, out_rnaup, out_log, args_tar): for index in range(1, num_up + 1): out_tmp_up = open(os.path.join( args_tar.out_folder, "".join([self.tmps["rnaup"], str(index), ".txt"])), "w") out_err = open(os.path.join( args_tar.out_folder, "".join([self.tmps["log"], str(index), ".txt"])), "w") in_up = open(os.path.join( args_tar.out_folder, "".join([self.tmps["tmp"], str(index), ".fa"])), "r") p = Popen([os.path.join(args_tar.vienna_path, "RNAup"), "-u", str(args_tar.unstr_region_rnaup), "-o", "--interaction_first"], stdin=in_up, stdout=out_tmp_up, stderr=out_err) processes.append(p) if len(processes) != 0: time.sleep(5) self._wait_process(processes) os.system("rm " + os.path.join(args_tar.out_folder, self.tmps["all_fa"])) self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder) os.system("rm " + os.path.join(args_tar.out_folder, self.tmps["all_txt"])) def _merge_txt(self, num_up, out_rnaup, out_log, out_folder): for index in range(1, num_up + 1): self.helper.merge_file( os.path.join(out_folder, "".join([self.tmps["rnaup"], str(index), ".txt"])), out_rnaup) self.helper.merge_file( os.path.join(out_folder, "".join([self.tmps["log"], str(index), ".txt"])), 
out_log) def _get_continue(self, out_rnaup): srnas = [] matchs = {} out = open("tmp.txt", "w") with open(out_rnaup) as f_h: for line in f_h: line = line.strip() if ">srna" in line: srna = line[1:] srnas.append(srna) matchs[srna] = [] else: matchs[srna].append(line) srnas = srnas[:-1] for srna in srnas: out.write(">" + srna + "\n") for target in matchs[srna]: out.write(target + "\n") out.close() os.remove(out_rnaup) shutil.move("tmp.txt", out_rnaup) return srnas def _rnaup(self, prefixs, args_tar): for prefix in prefixs: srnas = [] print("Running RNAup of {0}".format(prefix)) if not os.path.exists(os.path.join(self.rnaup_path, prefix)): os.mkdir(os.path.join(self.rnaup_path, prefix)) num_up = 0 processes = [] out_rnaup = os.path.join(self.rnaup_path, prefix, "_".join([prefix + "_RNAup.txt"])) out_log = os.path.join(self.rnaup_path, prefix, "_".join([prefix + "_RNAup.log"])) if "_".join([prefix, "RNAup.txt"]) in \ os.listdir(os.path.join(self.rnaup_path, prefix)): if not args_tar.continue_rnaup: os.remove(out_rnaup) os.remove(out_log) else: srnas = self._get_continue(out_rnaup) with open(os.path.join(self.srna_seq_path, "_".join([ self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f: for line in s_f: line = line.strip() if line.startswith(">"): if line[1:] in srnas: start = False continue start = True print("Running RNAup with {0}".format(line[1:])) num_up += 1 out_up = open(os.path.join(args_tar.out_folder, "".join([self.tmps["tmp"], str(num_up), ".fa"])), "w") out_up.write(line + "\n") else: if start: out_up.write(line + "\n") out_up.close() self.helper.merge_file(os.path.join( self.target_seq_path, "_".join([prefix, "target.fa"])), os.path.join(args_tar.out_folder, "".join([self.tmps["tmp"], str(num_up), ".fa"]))) if num_up == args_tar.core_up: self._run_rnaup(num_up, processes, out_rnaup, out_log, args_tar) processes = [] num_up = 0 self._run_rnaup(num_up, processes, out_rnaup, out_log, args_tar) def _merge_rnaplex_rnaup(self, prefixs, args_tar): for prefix in 
prefixs: rnaplex_file = None rnaup_file = None out_rnaplex = None out_rnaup = None self.helper.check_make_folder(os.path.join( self.merge_path, prefix)) print("Ranking {0} now...".format(prefix)) if (args_tar.program == "both") or (args_tar.program == "RNAplex"): rnaplex_file = os.path.join(self.rnaplex_path, prefix, "_".join([prefix, "RNAplex.txt"])) out_rnaplex = os.path.join( self.rnaplex_path, prefix, "_".join([prefix, "RNAplex_rank.csv"])) if (args_tar.program == "both") or (args_tar.program == "RNAup"): rnaup_file = os.path.join(self.rnaup_path, prefix, "_".join([prefix, "RNAup.txt"])) out_rnaup = os.path.join(self.rnaup_path, prefix, "_".join([prefix, "RNAup_rank.csv"])) merge_srna_target(rnaplex_file, rnaup_file, args_tar, out_rnaplex, out_rnaup, os.path.join(self.merge_path, prefix, "_".join([prefix, "merge.csv"])), os.path.join(self.merge_path, prefix, "_".join([prefix, "overlap.csv"])), os.path.join(self.srna_path, "_".join([prefix, "sRNA.gff"])), os.path.join(self.gff_path, prefix + ".gff")) def run_srna_target_prediction(self, args_tar): self._check_gff(args_tar.gffs) self._check_gff(args_tar.srnas) self.multiparser.parser_gff(args_tar.gffs, None) self.multiparser.parser_fasta(args_tar.fastas) self.multiparser.parser_gff(args_tar.srnas, "sRNA") prefixs = [] self._gen_seq(prefixs, args_tar) if (args_tar.program == "both") or ( args_tar.program == "RNAplex"): self._rna_plex(prefixs, args_tar) self.helper.remove_all_content(self.target_seq_path, "_target_", "file") if (args_tar.program == "both") or ( args_tar.program == "RNAup"): self._rnaup(prefixs, args_tar) self._merge_rnaplex_rnaup(prefixs, args_tar) if (args_tar.program == "RNAplex") or ( args_tar.program == "both"): for strain in os.listdir(os.path.join( args_tar.out_folder, "RNAplex")): shutil.rmtree(os.path.join(args_tar.out_folder, "RNAplex", strain, "RNAplfold")) self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"], "dir") self.helper.remove_all_content(args_tar.out_folder, 
self.tmps["tmp"], "file") self.helper.remove_tmp(args_tar.gffs) self.helper.remove_tmp(args_tar.srnas) self.helper.remove_tmp(args_tar.fastas) self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")