class TestConverter(unittest.TestCase): def setUp(self): self.converter = Converter() self.example = Example() self.converter.gff3parser = Mock_gff3_parser self.converter._print_rntptt_title = Mock_func().print_rntptt_title self.converter.tsspredator = Mock_TSSPredatorReader() self.converter._read_file = Mock_func().mock_read_file self.gff_file = self.example.gff_file self.ptt_out = self.example.ptt_out self.rnt_out = self.example.rnt_out self.srna_out = self.example.srna_out self.embl_file = self.example.embl_file self.embl_out = self.example.embl_out self.multi_embl = self.example.multi_embl self.gff_out = self.example.gff_out self.mastertable = self.example.mastertable self.tss_file = self.example.tss_file self.fasta_file = self.example.fasta_file self.transterm = self.example.transterm self.term_file = self.example.term_file self.circ_file = self.example.circrna_table self.circ_all = self.example.circrna_all self.circ_best = self.example.circrna_best self.test_folder = "test_folder" self.mock_args = MockClass() if (not os.path.exists(self.test_folder)): os.mkdir(self.test_folder) def tearDown(self): if os.path.exists(self.test_folder): shutil.rmtree(self.test_folder) def test_print_rntptt_file(self): cdss = [] genes = [] rnas = [] gff_dict = Example().gff_dict for gff in gff_dict: if gff["feature"] == "gene": genes.append(self.converter.gff3parser.entries(self, gff)) elif gff["feature"] == "CDS": cdss.append(self.converter.gff3parser.entries(self, gff)) elif gff["feature"] == "tRNA": rnas.append(self.converter.gff3parser.entries(self, gff)) out_p = StringIO() out_r = StringIO() self.converter._print_rntptt_file(out_p, cdss, genes) self.converter._print_rntptt_file(out_r, rnas, genes) self.assertEqual(out_p.getvalue().split("\n")[:-1], self.example.ptt_out_list) self.assertEqual(out_r.getvalue().split("\n")[:-1], self.example.rnt_out_list) out_p.close() out_r.close() def test_srna2pttrnt(self): srna_input_file = os.path.join(self.test_folder, "srna.gff") srna_output_file = os.path.join(self.test_folder, "srna.out") with open(srna_input_file, "w") as fh: fh.write(self.gff_file) srnas = [] self.converter._srna2rntptt(srna_input_file, srna_output_file, srnas, 1234567) datas = import_data(srna_output_file) self.assertEqual(set(datas), set(self.srna_out.split("\n"))) def test_multi_embl_pos(self): embls = [] for line in self.embl_file.split("\n"): datas = self.converter._multi_embl_pos(line.strip()) if datas != "Wrong": embls.append(datas) for index in range(0, 7): self.assertDictEqual(embls[index], self.embl_out[index]) for index in range(0, 2): self.assertDictEqual(embls[-1]["pos"][index], self.multi_embl[index]) def test_parser_embl_data(self): embl_file = os.path.join(self.test_folder, "test.embl") embl_out = os.path.join(self.test_folder, "test.embl_out") out = StringIO() with open(embl_file, "w") as eh: for line in self.embl_file.split("\n"): eh.write(line + "\n") info = self.converter._parser_embl_data(embl_file, out) datas = out.getvalue().split("\n") self.assertEqual(set(datas[:-1]), set(self.gff_out.split("\n"))) self.assertEqual(info[0], "NC_007795.1") for index in range(0, 2): self.assertDictEqual(info[1]["pos"][index], self.multi_embl[index]) out.close() def test_multi_tss_class(self): nums = {"tss": 0, "tss_uni": 0, "class": 1} utrs = {"total": [], "pri": [], "sec": []} tss_features = {"tss_types": [], "locus_tags": [], "utr_lengths": []} tss_index = defaultdict(lambda: 0) master_file = os.path.join(self.test_folder, "test.tsv") fh = StringIO(self.mastertable) for tss in self.converter.tsspredator.entries(fh): self.converter._multi_tss_class(tss, tss_index, tss_features, nums, utrs) fh.close() self.assertDictEqual(nums, {'tss_uni': 0, 'class': 5, 'tss': 2}) def test_convert_mastertable2gff(self): master_file = os.path.join(self.test_folder, "test.tsv") with open(master_file, "w") as th: th.write(self.mastertable) out_gff = os.path.join(self.test_folder, "test.tsv_out") self.converter.convert_mastertable2gff(master_file, "ANNOgesic", "TSS", "aaa", out_gff) datas = import_data(out_gff) self.assertEqual(set(datas), set(self.tss_file.split("\n"))) def test_convert_gff2rntptt(self): srna_input_file = os.path.join(self.test_folder, "srna.gff") srna_output_file = os.path.join(self.test_folder, "srna.out") gff_file = os.path.join(self.test_folder, "test.gff") rnt_file = os.path.join(self.test_folder, "test.rnt") ptt_file = os.path.join(self.test_folder, "test.ptt") fasta_file = os.path.join(self.test_folder, "test.fa") with open(srna_input_file, "w") as fh: fh.write(self.gff_file) with open(gff_file, "w") as fh: fh.write(self.gff_file) with open(fasta_file, "w") as fh: fh.write(self.fasta_file) self.converter.convert_gff2rntptt(gff_file, fasta_file, ptt_file, rnt_file, srna_input_file, srna_output_file) self.assertTrue(srna_output_file) self.assertTrue(rnt_file) self.assertTrue(ptt_file) def test_convert_embl2gff(self): embl_file = os.path.join(self.test_folder, "test.embl") gff_file = os.path.join(self.test_folder, "test.embl_out") with open(embl_file, "w") as eh: for line in self.embl_file.split("\n"): eh.write(line + "\n") self.converter.convert_embl2gff(embl_file, gff_file) datas = import_data(gff_file) self.assertEqual(set(datas[1:-2]), set(self.gff_out.split("\n"))) def test_convert_transtermhp2gff(self): transterm_file = os.path.join(self.test_folder, "test_best_terminator_after_gene.bag") gff_file = os.path.join(self.test_folder, "transterm.gff") with open(transterm_file, "w") as th: th.write(self.transterm) self.converter.convert_transtermhp2gff(transterm_file, gff_file) datas = import_data(gff_file) self.assertEqual(set(datas), set(self.term_file.split("\n"))) def get_info(datas): f_datas = [] for data in datas: if not data.startswith("#"): f_datas.append("\t".join(data.split("\t")[:8])) return f_datas def test_convert_circ2gff(self): circ_file = os.path.join(self.test_folder, "circ.csv") out_all = os.path.join(self.test_folder, "all.gff") out_filter = os.path.join(self.test_folder, "best.gff") with open(circ_file, "w") as ch: ch.write(self.circ_file) args = self.mock_args.mock() args.start_ratio = 0.5 args.end_ratio = 0.5 args.support = 5 self.converter.convert_circ2gff(circ_file, args, out_all, out_filter) datas = import_data(out_all) f_datas = [] for data in datas: if not data.startswith("#"): f_datas.append("\t".join(data.split("\t")[:8])) c_datas = [] for data in self.circ_all.split("\n"): if not data.startswith("#"): c_datas.append("\t".join(data.split("\t")[:8])) self.assertListEqual(f_datas, c_datas) datas = import_data(out_filter) f_datas = [] for data in datas: if not data.startswith("#"): f_datas.append("\t".join(data.split("\t")[:8])) c_datas = [] for data in self.circ_best.split("\n"): if not data.startswith("#"): c_datas.append("\t".join(data.split("\t")[:8])) self.assertListEqual(f_datas, c_datas)
class RATT(object): '''annotation transfer''' def __init__(self, args_ratt): self.multiparser = Multiparser() self.converter = Converter() self.format_fixer = FormatFixer() self.helper = Helper() if args_ratt.ref_gbk: self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp") self.gbk_tmp = os.path.join(self.gbk, "tmp") self.embl = os.path.join(args_ratt.ref_gbk, "embls") if args_ratt.ref_embls: self.embl = args_ratt.ref_embls self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt") self.tmp_files = { "tar": os.path.join(args_ratt.tar_fastas, "tmp"), "ref": os.path.join(args_ratt.ref_fastas, "tmp"), "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"), "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"), "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"), "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt") } def _convert_to_pttrnt(self, gffs, files): for gff in files: if gff.endswith(".gff"): gff = os.path.join(gffs, gff) filename = gff.split("/") prefix = filename[-1][:-4] rnt = gff[:-3] + "rnt" ptt = gff[:-3] + "ptt" fasta = self.helper.get_correct_file(self.tmp_files["tar"], ".fa", prefix, None, None) if fasta: self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt, None, None) def _remove_files(self, args_ratt, out_gbk): self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file") self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file") self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file") self.helper.move_all_content(self.tmp_files["out_gff"], args_ratt.gff_outfolder, None) shutil.rmtree(self.tmp_files["out_gff"]) shutil.rmtree(self.tmp_files["tar"]) shutil.rmtree(self.tmp_files["ref"]) self.helper.remove_tmp_dir(args_ratt.tar_fastas) self.helper.remove_tmp_dir(args_ratt.ref_fastas) self.helper.remove_tmp_dir(args_ratt.ref_embls) self.helper.remove_tmp_dir(args_ratt.ref_gbk) def _convert_to_gff(self, ratt_result, args_ratt, files): name = ratt_result.split(".") filename = ".".join(name[1:-2]) + ".gff" output_file = os.path.join(args_ratt.output_path, filename) self.converter.convert_embl2gff( os.path.join(args_ratt.output_path, ratt_result), output_file) self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]), "tmp_gff") shutil.move("tmp_gff", output_file) shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder, filename)) files.append(filename) def _parser_embl_gbk(self, files): self.helper.check_make_folder(self.gbk) for file_ in files: close = False with open(file_, "r") as f_h: for line in f_h: if (line.startswith("LOCUS")): out = open(self.gbk_tmp, "w") datas = line.split(" ") for data in datas: if (len(data) != 0) and (data != "LOCUS"): filename = ".".join([data, "gbk"]) break elif (line.startswith("VERSION")): datas = line.split(" ") for data in datas: if (len(data) != 0) and (data != "VERSION"): new_filename = ".".join([data, "gbk"]) break if new_filename.find(filename): filename = new_filename if out: out.write(line) if line.startswith("//"): out.close() close = True shutil.move(self.gbk_tmp, os.path.join(self.gbk, filename)) if not close: out.close() return self.gbk def _convert_embl(self, ref_embls): '''convert gbk to embl''' detect_gbk = False gbks = [] out_gbk = None for embl in os.listdir(ref_embls): if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or ( embl.endswith(".gb")): detect_gbk = True gbks.append(os.path.join(ref_embls, embl)) if not detect_gbk: print("Error: Please assign proper Genebank files!") sys.exit() elif detect_gbk: out_gbk = self._parser_embl_gbk(gbks) self.converter.convert_gbk2embl(out_gbk) self.helper.check_make_folder(self.embl) self.helper.move_all_content(out_gbk, self.embl, [".embl"]) return out_gbk def _run_ratt(self, args_ratt, tar, ref, out): call([ args_ratt.ratt_path, self.embl, os.path.join(self.tmp_files["tar"], tar + ".fa"), args_ratt.element, args_ratt.transfer_type, os.path.join(self.tmp_files["ref"], ref + ".fa") ], stdout=out, stderr=DEVNULL) def _format_and_run(self, args_ratt): print("Running RATT") for pair in args_ratt.pairs: ref = pair.split(":")[0] tar = pair.split(":")[1] out = open(self.ratt_log, "w+") self._run_ratt(args_ratt, tar, ref, out) for filename in os.listdir(): if ("final" in filename): shutil.move(filename, os.path.join(args_ratt.output_path, filename)) elif (args_ratt.element in filename) or ( "query" in filename) or ("Reference" in filename) or ( "Query" in filename) or ("Sequences" in filename): if os.path.isfile(filename): os.remove(filename) if os.path.isdir(filename): shutil.rmtree(filename) out.close() def annotation_transfer(self, args_ratt): self.multiparser.parser_fasta(args_ratt.tar_fastas) self.multiparser.parser_fasta(args_ratt.ref_fastas) out_gbk = None if args_ratt.ref_embls is None: out_gbk = self._convert_embl(args_ratt.ref_gbk) self._format_and_run(args_ratt) if args_ratt.convert: files = [] for data in os.listdir(args_ratt.output_path): if "final.embl" in data: self._convert_to_gff(data, args_ratt, files) self._convert_to_pttrnt(args_ratt.gff_outfolder, files) self.helper.check_make_folder(self.tmp_files["out_gff"]) for folder in os.listdir(args_ratt.tar_fastas): files = [] if "_folder" in folder: datas = folder.split("_folder") prefix = ".".join(datas[0].split(".")[:-1]) for file_ in os.listdir( os.path.join(args_ratt.tar_fastas, folder)): files.append(file_[:-3]) for gff in os.listdir(args_ratt.gff_outfolder): for file_ in files: if (".gff" in gff) and (file_ == gff[:-4]): self.helper.merge_file( os.path.join(args_ratt.gff_outfolder, gff), self.tmp_files["gff"]) if (".ptt" in gff) and (file_ == gff[:-4]): self.helper.merge_file( os.path.join(args_ratt.gff_outfolder, gff), self.tmp_files["ptt"]) if (".rnt" in gff) and (file_ == gff[:-4]): self.helper.merge_file( os.path.join(args_ratt.gff_outfolder, gff), self.tmp_files["rnt"]) if os.path.exists(self.tmp_files["gff"]): shutil.move( self.tmp_files["gff"], os.path.join(self.tmp_files["out_gff"], prefix + ".gff")) shutil.move( self.tmp_files["ptt"], os.path.join(self.tmp_files["out_gff"], prefix + ".ptt")) shutil.move( self.tmp_files["rnt"], os.path.join(self.tmp_files["out_gff"], prefix + ".rnt")) else: print("Error: Please check your fasta or " "annotation files, they should only contain " "the query genome. And make sure your RATT can " "work properly (check $ANNOgesic/output/" "annotation_transfer/ratt_log.txt).") self._remove_files(args_ratt, out_gbk)
class RATT(object): def __init__(self, args_ratt): self.multiparser = Multiparser() self.converter = Converter() self.format_fixer = FormatFixer() self.helper = Helper() self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp") self.gbk_tmp = os.path.join(self.gbk, "tmp") self.embl = os.path.join(args_ratt.ref_embls, "embls") self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt") self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"), "ref": os.path.join(args_ratt.ref_fastas, "tmp"), "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"), "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"), "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"), "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")} def _convert_to_pttrnt(self, gffs, files): for gff in files: if gff.endswith(".gff"): gff = os.path.join(gffs, gff) filename = gff.split("/") prefix = filename[-1][:-4] rnt = gff[:-3] + "rnt" ptt = gff[:-3] + "ptt" fasta = self.helper.get_correct_file(self.tmp_files["tar"], ".fa", prefix, None, None) if fasta: self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt, None, None) def _remove_files(self, args_ratt, out_gbk): self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file") self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file") self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file") self.helper.move_all_content(self.tmp_files["out_gff"], args_ratt.gff_outfolder, None) shutil.rmtree(self.tmp_files["out_gff"]) shutil.rmtree(self.tmp_files["tar"]) shutil.rmtree(self.tmp_files["ref"]) shutil.rmtree(self.embl) self.helper.remove_all_content(args_ratt.tar_fastas, "_folder", "dir") self.helper.remove_all_content(args_ratt.ref_fastas, "_folder", "dir") if out_gbk: shutil.rmtree(out_gbk) def _convert_to_gff(self, ratt_result, args_ratt, files): name = ratt_result.split(".") filename = ".".join(name[1:-2]) + ".gff" output_file = os.path.join(args_ratt.output_path, filename) self.converter.convert_embl2gff( os.path.join(args_ratt.output_path, ratt_result), output_file) self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]), "tmp_gff") shutil.move("tmp_gff", output_file) shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder, filename)) files.append(filename) def _parser_embl_gbk(self, files): self.helper.check_make_folder(self.gbk) for file_ in files: close = False with open(file_, "r") as f_h: for line in f_h: if (line.startswith("LOCUS")): out = open(self.gbk_tmp, "w") datas = line.split(" ") for data in datas: if (len(data) != 0) and (data != "LOCUS"): filename = ".".join([data, "gbk"]) break elif (line.startswith("VERSION")): datas = line.split(" ") for data in datas: if (len(data) != 0) and (data != "VERSION"): new_filename = ".".join([data, "gbk"]) break if new_filename.find(filename): filename = new_filename if out: out.write(line) if line.startswith("//"): out.close() close = True shutil.move(self.gbk_tmp, os.path.join(self.gbk, filename)) if not close: out.close() return self.gbk def _convert_embl(self, ref_embls): detect_gbk = False gbks = [] out_gbk = None for embl in os.listdir(ref_embls): if embl.endswith(".gbk"): detect_gbk = True gbks.append(os.path.join(ref_embls, embl)) if not detect_gbk: print("Error: please assign proper folder for Genebank file!!!") sys.exit() elif detect_gbk: out_gbk = self._parser_embl_gbk(gbks) self.converter.convert_gbk2embl(out_gbk) self.helper.check_make_folder(self.embl) self.helper.move_all_content(out_gbk, self.embl, [".embl"]) return out_gbk def _run_ratt(self, args_ratt, tar, ref, out): call([args_ratt.ratt_path, self.embl, os.path.join(self.tmp_files["tar"], tar + ".fa"), args_ratt.element, args_ratt.transfer_type, os.path.join(self.tmp_files["ref"], ref + ".fa")], stdout=out, stderr=DEVNULL) def _format_and_run(self, args_ratt): print("Running RATT...") for pair in args_ratt.pairs: ref = pair.split(":")[0] tar = pair.split(":")[1] out = open(self.ratt_log, "w+") print(tar) self._run_ratt(args_ratt, tar, ref, out) for filename in os.listdir(): if ("final" in filename): shutil.move(filename, os.path.join(args_ratt.output_path, filename)) elif (args_ratt.element in filename) or ( "query" in filename) or ( "Reference" in filename) or ( "Query" in filename) or ( "Sequences" in filename): if os.path.isfile(filename): os.remove(filename) if os.path.isdir(filename): shutil.rmtree(filename) out.close() def annotation_transfer(self, args_ratt): self.multiparser.parser_fasta(args_ratt.tar_fastas) self.multiparser.parser_fasta(args_ratt.ref_fastas) out_gbk = self._convert_embl(args_ratt.ref_embls) self._format_and_run(args_ratt) if args_ratt.convert: files = [] for data in os.listdir(args_ratt.output_path): if "final.embl" in data: self._convert_to_gff(data, args_ratt, files) self._convert_to_pttrnt(args_ratt.gff_outfolder, files) self.helper.check_make_folder(self.tmp_files["out_gff"]) for folder in os.listdir(args_ratt.tar_fastas): files = [] if "_folder" in folder: datas = folder.split("_folder") prefix = datas[0][:-3] for file_ in os.listdir(os.path.join(args_ratt.tar_fastas, folder)): files.append(file_[:-3]) for gff in os.listdir(args_ratt.gff_outfolder): for file_ in files: if (".gff" in gff) and (file_ == gff[:-4]): self.helper.merge_file(os.path.join( args_ratt.gff_outfolder, gff), self.tmp_files["gff"]) if (".ptt" in gff) and (file_ == gff[:-4]): self.helper.merge_file(os.path.join( args_ratt.gff_outfolder, gff), self.tmp_files["ptt"]) if (".rnt" in gff) and (file_ == gff[:-4]): self.helper.merge_file(os.path.join( args_ratt.gff_outfolder, gff), self.tmp_files["rnt"]) shutil.move(self.tmp_files["gff"], os.path.join( self.tmp_files["out_gff"], prefix + ".gff")) shutil.move(self.tmp_files["ptt"], os.path.join( self.tmp_files["out_gff"], prefix + ".ptt")) shutil.move(self.tmp_files["rnt"], os.path.join( self.tmp_files["out_gff"], prefix + ".rnt")) self._remove_files(args_ratt, out_gbk)
class RATT(object): '''annotation transfer''' def __init__(self, args_ratt): self.multiparser = Multiparser() self.converter = Converter() self.format_fixer = FormatFixer() self.helper = Helper() if args_ratt.ref_gbk: self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp") self.gbk_tmp = os.path.join(self.gbk, "tmp") self.embl = os.path.join(args_ratt.ref_gbk, "embls") if args_ratt.ref_embls: self.embl = args_ratt.ref_embls self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt") self.tmp_files = { "tar": os.path.join(args_ratt.tar_fastas, "tmp"), "ref": os.path.join(args_ratt.ref_fastas, "tmp"), "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"), "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"), "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"), "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt") } def _convert_to_pttrnt(self, gffs, files, log): for gff in files: if gff.endswith(".gff"): gff = os.path.join(gffs, gff) filename = gff.split("/") prefix = filename[-1][:-4] rnt = gff[:-3] + "rnt" ptt = gff[:-3] + "ptt" fasta = self.helper.get_correct_file(self.tmp_files["tar"], ".fa", prefix, None, None) if fasta: self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt, None, None) log.write("\t" + ptt + " is generated.\n") log.write("\t" + rnt + " is generated.\n") def _remove_files(self, args_ratt, out_gbk, log): self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file") self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file") self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file") log.write("Moving the final output files to {0}.\n".format( args_ratt.gff_outfolder)) self.helper.move_all_content(self.tmp_files["out_gff"], args_ratt.gff_outfolder, None) log.write("Remove the temperary files.\n") shutil.rmtree(self.tmp_files["out_gff"]) shutil.rmtree(self.tmp_files["tar"]) shutil.rmtree(self.tmp_files["ref"]) self.helper.remove_tmp_dir(args_ratt.tar_fastas) self.helper.remove_tmp_dir(args_ratt.ref_fastas) self.helper.remove_tmp_dir(args_ratt.ref_embls) self.helper.remove_tmp_dir(args_ratt.ref_gbk) def _convert_to_gff(self, ratt_result, args_ratt, files, log): name = ratt_result.split(".") filename = ".".join(name[1:-2]) + ".gff" output_file = os.path.join(args_ratt.output_path, filename) self.converter.convert_embl2gff( os.path.join(args_ratt.output_path, ratt_result), output_file) self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]), "tmp_gff") shutil.move("tmp_gff", output_file) shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder, filename)) log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) + " is generated.\n") files.append(filename) def _parser_embl_gbk(self, files): self.helper.check_make_folder(self.gbk) for file_ in files: close = False with open(file_, "r") as f_h: for line in f_h: if (line.startswith("LOCUS")): out = open(self.gbk_tmp, "w") datas = line.split(" ") for data in datas: if (len(data) != 0) and (data != "LOCUS"): filename = ".".join([data.strip(), "gbk"]) break elif (line.startswith("VERSION")): datas = line.split(" ") for data in datas: if (len(data) != 0) and (data != "VERSION"): new_filename = ".".join([data.strip(), "gbk"]) break if new_filename.find(filename): filename = new_filename if out: out.write(line) if line.startswith("//"): out.close() close = True shutil.move(self.gbk_tmp, os.path.join(self.gbk, filename)) if not close: out.close() return self.gbk def _convert_embl(self, ref_embls, log): '''convert gbk to embl''' detect_gbk = False gbks = [] out_gbk = None for embl in os.listdir(ref_embls): if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or ( embl.endswith(".gb")): detect_gbk = True gbks.append(os.path.join(ref_embls, embl)) if not detect_gbk: log.write( "--related_gbk_files is assigned, but not gbk files are detected.\n" "The gbk file names need to be ended at .gbk, .gb, or .gbff. \n" ) print("Error: Please assign proper Genebank files!") sys.exit() elif detect_gbk: out_gbk = self._parser_embl_gbk(gbks) log.write( "Running converter.py to convert gbk file to embl format.\n") self.converter.convert_gbk2embl(out_gbk) self.helper.check_make_folder(self.embl) self.helper.move_all_content(out_gbk, self.embl, [".embl"]) log.write("\t" + self.embl + " is generated and the embl files are stored in it.\n") return out_gbk def _run_ratt(self, args_ratt, tar, ref, out, log): if (not os.path.exists(self.embl)) or (not os.path.exists( os.path.join(self.tmp_files["tar"], tar + ".fa"))) or ( not os.path.exists( os.path.join(self.tmp_files["ref"], ref + ".fa"))): print("Error: Please check --compare_pair, the strain names " "should be the same as the strain names in fasta, " "genbank or embl files!") log.write( "The strain names in --compare_pair should be the same " "as the strain names in fasta, genbank, or embl files.\n") sys.exit() log.write("Make sure your RATT version is at least 1.64.\n") log.write("If the RATT can not run properly, please check the " "RATT_HOME and PAGIT_HOME is assigned correctly.\n") log.write(" ".join([ args_ratt.ratt_path, self.embl, os.path.join(self.tmp_files["tar"], tar + ".fa"), args_ratt.element, args_ratt.transfer_type, os.path.join(self.tmp_files["ref"], ref + ".fa") ]) + "\n") call([ args_ratt.ratt_path, self.embl, os.path.join(self.tmp_files["tar"], tar + ".fa"), args_ratt.element, args_ratt.transfer_type, os.path.join(self.tmp_files["ref"], ref + ".fa") ], stdout=out, stderr=DEVNULL) log.write("Done!\n") def _format_and_run(self, args_ratt, log): print("Running RATT") for pair in args_ratt.pairs: ref = pair.split(":")[0] tar = pair.split(":")[1] out = open(self.ratt_log, "w+") self._run_ratt(args_ratt, tar, ref, out, log) log.write("The following files are generatd:\n") for filename in os.listdir(): if ("final" in filename): log.write("\t" + filename + "\n") shutil.move(filename, os.path.join(args_ratt.output_path, filename)) elif (args_ratt.element in filename) or ( "query" in filename) or ("Reference" in filename) or ( "Query" in filename) or ("Sequences" in filename): log.write("\t" + filename + "\n") if os.path.isfile(filename): os.remove(filename) if os.path.isdir(filename): shutil.rmtree(filename) out.close() def annotation_transfer(self, args_ratt, log): self.multiparser.parser_fasta(args_ratt.tar_fastas) self.multiparser.parser_fasta(args_ratt.ref_fastas) out_gbk = None if args_ratt.ref_embls is None: out_gbk = self._convert_embl(args_ratt.ref_gbki, log) self._format_and_run(args_ratt, log) files = [] for data in os.listdir(args_ratt.output_path): if "final.embl" in data: log.write( "Running converter.py to convert embl " "files in {0} to gff, ptt, and rnt format.\n".format(data)) self._convert_to_gff(data, args_ratt, files, log) self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log) self.helper.check_make_folder(self.tmp_files["out_gff"]) log.write("Merging the output of {0}.\n".format(data)) for folder in os.listdir(args_ratt.tar_fastas): files = [] if "_folder" in folder: datas = folder.split("_folder") prefix = ".".join(datas[0].split(".")[:-1]) for file_ in os.listdir( os.path.join(args_ratt.tar_fastas, folder)): files.append(file_[:-3]) for gff in os.listdir(args_ratt.gff_outfolder): for file_ in files: if (".gff" in gff) and (file_ == gff[:-4]): self.helper.merge_file( os.path.join(args_ratt.gff_outfolder, gff), self.tmp_files["gff"]) if (".ptt" in gff) and (file_ == gff[:-4]): self.helper.merge_file( os.path.join(args_ratt.gff_outfolder, gff), self.tmp_files["ptt"]) if (".rnt" in gff) and (file_ == gff[:-4]): self.helper.merge_file( os.path.join(args_ratt.gff_outfolder, gff), self.tmp_files["rnt"]) if os.path.exists(self.tmp_files["gff"]): shutil.move( self.tmp_files["gff"], os.path.join(self.tmp_files["out_gff"], prefix + ".gff")) shutil.move( self.tmp_files["ptt"], os.path.join(self.tmp_files["out_gff"], prefix + ".ptt")) shutil.move( self.tmp_files["rnt"], os.path.join(self.tmp_files["out_gff"], prefix + ".rnt")) else: print("Error: Please check your fasta or " "annotation files, they should only contain " "the query genome. And make sure your RATT can " "work properly (check $ANNOgesic/output/" "annotation_transfer/ratt_log.txt).") log.write("Please check your fasta or " "annotation files, they should only contain " "the query genome. And make sure your RATT can " "work properly (check $ANNOgesic/output/" "annotation_transfer/ratt_log.txt).\n") self._remove_files(args_ratt, out_gbk, log)
class TestConverter(unittest.TestCase): def setUp(self): self.converter = Converter() self.example = Example() self.converter.gff3parser = Mock_gff3_parser self.converter._print_rntptt_title = Mock_func().print_rntptt_title self.converter.tsspredator = Mock_TSSPredatorReader() self.converter._read_file = Mock_func().mock_read_file self.gff_file = self.example.gff_file self.ptt_out = self.example.ptt_out self.rnt_out = self.example.rnt_out self.srna_out = self.example.srna_out self.embl_file = self.example.embl_file self.embl_out = self.example.embl_out self.multi_embl = self.example.multi_embl self.gff_out = self.example.gff_out self.mastertable = self.example.mastertable self.tss_file = self.example.tss_file self.fasta_file = self.example.fasta_file self.transterm = self.example.transterm self.term_file = self.example.term_file self.circ_file = self.example.circrna_table self.circ_all = self.example.circrna_all self.circ_best = self.example.circrna_best self.test_folder = "test_folder" self.mock_args = MockClass() if (not os.path.exists(self.test_folder)): os.mkdir(self.test_folder) def tearDown(self): if os.path.exists(self.test_folder): shutil.rmtree(self.test_folder) def test_print_rntptt_file(self): cdss = [] genes = [] rnas = [] gff_dict = Example().gff_dict for gff in gff_dict: if gff["feature"] == "gene": genes.append(self.converter.gff3parser.entries(self, gff)) elif gff["feature"] == "CDS": cdss.append(self.converter.gff3parser.entries(self, gff)) elif gff["feature"] == "tRNA": rnas.append(self.converter.gff3parser.entries(self, gff)) out_p = StringIO() out_r = StringIO() self.converter._print_rntptt_file(out_p, cdss, genes) self.converter._print_rntptt_file(out_r, rnas, genes) self.assertEqual(out_p.getvalue().split("\n")[:-1], self.example.ptt_out_list) self.assertEqual(out_r.getvalue().split("\n")[:-1], self.example.rnt_out_list) out_p.close() out_r.close() def test_srna2pttrnt(self): srna_input_file = os.path.join(self.test_folder, "srna.gff") srna_output_file = os.path.join(self.test_folder, "srna.out") with open(srna_input_file, "w") as fh: fh.write(self.gff_file) srnas = [] self.converter._srna2rntptt(srna_input_file, srna_output_file, srnas, 1234567) datas = import_data(srna_output_file) self.assertEqual(set(datas), set(self.srna_out.split("\n"))) def test_multi_embl_pos(self): embls = [] for line in self.embl_file.split("\n"): datas = self.converter._multi_embl_pos(line.strip()) if datas != "Wrong": embls.append(datas) for index in range(0, 7): self.assertDictEqual(embls[index], self.embl_out[index]) for index in range(0, 2): self.assertDictEqual(embls[-1]["pos"][index], self.multi_embl[index]) def test_parser_embl_data(self): embl_file = os.path.join(self.test_folder, "test.embl") embl_out = os.path.join(self.test_folder, "test.embl_out") out = StringIO() with open(embl_file, "w") as eh: for line in self.embl_file.split("\n"): eh.write(line + "\n") info = self.converter._parser_embl_data(embl_file, out) datas = out.getvalue().split("\n") self.assertEqual(set(datas[:-1]), set(self.gff_out.split("\n"))) self.assertEqual(info[0], "NC_007795.1") for index in range(0, 2): self.assertDictEqual(info[1]["pos"][index], self.multi_embl[index]) out.close() def test_multi_tss_class(self): nums = {"tss": 0, "tss_uni": 0, "class": 1} utrs = {"total": [], "pri": [], "sec": []} tss_features = {"tss_types": [], "locus_tags": [], "utr_lengths": []} tss_index = defaultdict(lambda: 0) master_file = os.path.join(self.test_folder, "test.tsv") fh = StringIO(self.mastertable) for tss in self.converter.tsspredator.entries(fh): self.converter._multi_tss_class( tss, tss_index, tss_features, nums, utrs) fh.close() self.assertDictEqual(nums, {'tss_uni': 0, 'class': 5, 'tss': 2}) def test_convert_mastertable2gff(self): master_file = os.path.join(self.test_folder, "test.tsv") with open(master_file, "w") as th: th.write(self.mastertable) out_gff = os.path.join(self.test_folder, "test.tsv_out") self.converter.convert_mastertable2gff(master_file, "ANNOgesic", "TSS", "aaa", out_gff) datas = import_data(out_gff) self.assertEqual(set(datas), set(self.tss_file.split("\n"))) def test_convert_gff2rntptt(self): srna_input_file = os.path.join(self.test_folder, "srna.gff") srna_output_file = os.path.join(self.test_folder, "srna.out") gff_file = os.path.join(self.test_folder, "test.gff") rnt_file = os.path.join(self.test_folder, "test.rnt") ptt_file = os.path.join(self.test_folder, "test.ptt") fasta_file = os.path.join(self.test_folder, "test.fa") with open(srna_input_file, "w") as fh: fh.write(self.gff_file) with open(gff_file, "w") as fh: fh.write(self.gff_file) with open(fasta_file, "w") as fh: fh.write(self.fasta_file) self.converter.convert_gff2rntptt( gff_file, fasta_file, ptt_file, rnt_file, srna_input_file, srna_output_file) self.assertTrue(srna_output_file) self.assertTrue(rnt_file) self.assertTrue(ptt_file) def test_convert_embl2gff(self): embl_file = os.path.join(self.test_folder, "test.embl") gff_file = os.path.join(self.test_folder, "test.embl_out") with open(embl_file, "w") as eh: for line in self.embl_file.split("\n"): eh.write(line + "\n") self.converter.convert_embl2gff(embl_file, gff_file) datas = import_data(gff_file) self.assertEqual(set(datas[1:-2]), set(self.gff_out.split("\n"))) def test_convert_transtermhp2gff(self): transterm_file = os.path.join( self.test_folder, "test_best_terminator_after_gene.bag") gff_file = os.path.join(self.test_folder, "transterm.gff") with open(transterm_file, "w") as th: th.write(self.transterm) self.converter.convert_transtermhp2gff(transterm_file, gff_file) datas = import_data(gff_file) self.assertEqual(set(datas), set(self.term_file.split("\n"))) def get_info(datas): f_datas = [] for data in datas: if not data.startswith("#"): f_datas.append("\t".join(data.split("\t")[:8])) return f_datas def test_convert_circ2gff(self): circ_file = os.path.join(self.test_folder, "circ.csv") out_all = os.path.join(self.test_folder, "all.gff") out_filter = os.path.join(self.test_folder, "best.gff") with open(circ_file, "w") as ch: ch.write(self.circ_file) args = self.mock_args.mock() args.start_ratio = 0.5 args.end_ratio = 0.5 args.support = 5 self.converter.convert_circ2gff(circ_file, args, out_all, out_filter) datas = import_data(out_all) f_datas = [] for data in datas: if not data.startswith("#"): f_datas.append("\t".join(data.split("\t")[:8])) c_datas = [] for data in self.circ_all.split("\n"): if not data.startswith("#"): c_datas.append("\t".join(data.split("\t")[:8])) self.assertListEqual(f_datas, c_datas) datas = import_data(out_filter) f_datas = [] for data in datas: if not data.startswith("#"): f_datas.append("\t".join(data.split("\t")[:8])) c_datas = [] for data in self.circ_best.split("\n"): if not data.startswith("#"): c_datas.append("\t".join(data.split("\t")[:8])) self.assertListEqual(f_datas, c_datas)
class RATT(object): '''annotation transfer''' def __init__(self, args_ratt): self.multiparser = Multiparser() self.converter = Converter() self.format_fixer = FormatFixer() self.helper = Helper() if args_ratt.ref_gbk: self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp") self.gbk_tmp = os.path.join(self.gbk, "tmp") self.embl = os.path.join(args_ratt.ref_gbk, "embls") if args_ratt.ref_embls: self.embl = args_ratt.ref_embls self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt") self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"), "ref": os.path.join(args_ratt.ref_fastas, "tmp"), "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"), "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"), "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"), "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")} def _convert_to_pttrnt(self, gffs, files, log): for gff in files: if gff.endswith(".gff"): gff = os.path.join(gffs, gff) filename = gff.split("/") prefix = filename[-1][:-4] rnt = gff[:-3] + "rnt" ptt = gff[:-3] + "ptt" fasta = self.helper.get_correct_file(self.tmp_files["tar"], ".fa", prefix, None, None) if fasta: self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt, None, None) log.write("\t" + ptt + " is generated.\n") log.write("\t" + rnt + " is generated.\n") def _remove_files(self, args_ratt, out_gbk, log): self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file") self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file") self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file") log.write("Moving the final output files to {0}.\n".format(args_ratt.gff_outfolder)) self.helper.move_all_content(self.tmp_files["out_gff"], args_ratt.gff_outfolder, None) log.write("Remove the temperary files.\n") shutil.rmtree(self.tmp_files["out_gff"]) shutil.rmtree(self.tmp_files["tar"]) shutil.rmtree(self.tmp_files["ref"]) self.helper.remove_tmp_dir(args_ratt.tar_fastas) self.helper.remove_tmp_dir(args_ratt.ref_fastas) self.helper.remove_tmp_dir(args_ratt.ref_embls) self.helper.remove_tmp_dir(args_ratt.ref_gbk) def _convert_to_gff(self, ratt_result, args_ratt, files, log): name = ratt_result.split(".") filename = ".".join(name[1:-2]) + ".gff" output_file = os.path.join(args_ratt.output_path, filename) self.converter.convert_embl2gff( os.path.join(args_ratt.output_path, ratt_result), output_file) self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]), "tmp_gff") shutil.move("tmp_gff", output_file) shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder, filename)) log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) + " is generated.\n") files.append(filename) def _parser_embl_gbk(self, files): self.helper.check_make_folder(self.gbk) for file_ in files: close = False with open(file_, "r") as f_h: for line in f_h: if (line.startswith("LOCUS")): out = open(self.gbk_tmp, "w") datas = line.split(" ") for data in datas: if (len(data) != 0) and (data != "LOCUS"): filename = ".".join([data.strip(), "gbk"]) break elif (line.startswith("VERSION")): datas = line.split(" ") for data in datas: if (len(data) != 0) and (data != "VERSION"): new_filename = ".".join([data.strip(), "gbk"]) break if new_filename.find(filename): filename = new_filename if out: out.write(line) if line.startswith("//"): out.close() close = True shutil.move(self.gbk_tmp, os.path.join(self.gbk, filename)) if not close: out.close() return self.gbk def _convert_embl(self, ref_embls, log): '''convert gbk to embl''' detect_gbk = False gbks = [] out_gbk = None for embl in os.listdir(ref_embls): if (embl.endswith(".gbk")) or ( embl.endswith(".gbff")) or ( embl.endswith(".gb")): detect_gbk = True gbks.append(os.path.join(ref_embls, embl)) if not detect_gbk: log.write("--related_gbk_files is assigned, but not gbk files are detected.\n" "The gbk file names need to be ended at .gbk, .gb, or .gbff. \n") print("Error: Please assign proper Genebank files!") sys.exit() elif detect_gbk: out_gbk = self._parser_embl_gbk(gbks) log.write("Running converter.py to convert gbk file to embl format.\n") self.converter.convert_gbk2embl(out_gbk) self.helper.check_make_folder(self.embl) self.helper.move_all_content(out_gbk, self.embl, [".embl"]) log.write("\t" + self.embl + " is generated and the embl files are stored in it.\n") return out_gbk def _run_ratt(self, args_ratt, tar, ref, out, log): if (not os.path.exists(self.embl)) or ( not os.path.exists(os.path.join( self.tmp_files["tar"], tar + ".fa"))) or ( not os.path.exists(os.path.join( self.tmp_files["ref"], ref + ".fa"))): print("Error: Please check --compare_pair, the strain names " "should be the same as the strain names in fasta, " "genbank or embl files!") log.write("The strain names in --compare_pair should be the same " "as the strain names in fasta, genbank, or embl files.\n") sys.exit() log.write("Make sure your RATT version is at least 1.64.\n") log.write("If the RATT can not run properly, please check the " "RATT_HOME and PAGIT_HOME is assigned correctly.\n") log.write(" ".join([args_ratt.ratt_path, self.embl, os.path.join(self.tmp_files["tar"], tar + ".fa"), args_ratt.element, args_ratt.transfer_type, os.path.join(self.tmp_files["ref"], ref + ".fa")]) + "\n") call([args_ratt.ratt_path, self.embl, os.path.join(self.tmp_files["tar"], tar + ".fa"), args_ratt.element, args_ratt.transfer_type, os.path.join(self.tmp_files["ref"], ref + ".fa")], stdout=out, stderr=DEVNULL) log.write("Done!\n") def _format_and_run(self, args_ratt, log): print("Running RATT") for pair in args_ratt.pairs: ref = pair.split(":")[0] tar = pair.split(":")[1] out = open(self.ratt_log, "w+") self._run_ratt(args_ratt, tar, ref, out, log) log.write("The following files are generatd:\n") for filename in os.listdir(): if ("final" in filename): log.write("\t" + filename + "\n") shutil.move(filename, os.path.join(args_ratt.output_path, filename)) elif (args_ratt.element in filename) or ( "query" in filename) or ( "Reference" in filename) or ( "Query" in filename) or ( "Sequences" in filename): log.write("\t" + filename + "\n") if os.path.isfile(filename): os.remove(filename) if os.path.isdir(filename): shutil.rmtree(filename) out.close() def annotation_transfer(self, args_ratt, log): self.multiparser.parser_fasta(args_ratt.tar_fastas) self.multiparser.parser_fasta(args_ratt.ref_fastas) out_gbk = None if args_ratt.ref_embls is None: out_gbk = self._convert_embl(args_ratt.ref_gbki, log) self._format_and_run(args_ratt, log) files = [] for data in os.listdir(args_ratt.output_path): if "final.embl" in data: log.write("Running converter.py to convert embl " "files in {0} to gff, ptt, and rnt format.\n".format(data)) self._convert_to_gff(data, args_ratt, files, log) self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log) self.helper.check_make_folder(self.tmp_files["out_gff"]) log.write("Merging the output of {0}.\n".format(data)) for folder in os.listdir(args_ratt.tar_fastas): files = [] if "_folder" in folder: datas = folder.split("_folder") prefix = ".".join(datas[0].split(".")[:-1]) for file_ in os.listdir(os.path.join(args_ratt.tar_fastas, folder)): files.append(file_[:-3]) for gff in os.listdir(args_ratt.gff_outfolder): for file_ in files: if (".gff" in gff) and (file_ == gff[:-4]): self.helper.merge_file(os.path.join( args_ratt.gff_outfolder, gff), self.tmp_files["gff"]) if (".ptt" in gff) and (file_ == gff[:-4]): self.helper.merge_file(os.path.join( args_ratt.gff_outfolder, gff), self.tmp_files["ptt"]) if (".rnt" in gff) and (file_ == gff[:-4]): self.helper.merge_file(os.path.join( args_ratt.gff_outfolder, gff), self.tmp_files["rnt"]) if os.path.exists(self.tmp_files["gff"]): shutil.move(self.tmp_files["gff"], os.path.join( self.tmp_files["out_gff"], prefix + ".gff")) shutil.move(self.tmp_files["ptt"], os.path.join( self.tmp_files["out_gff"], prefix + ".ptt")) shutil.move(self.tmp_files["rnt"], os.path.join( self.tmp_files["out_gff"], prefix + ".rnt")) else: print("Error: Please check your fasta or " "annotation files, they should only contain " "the query genome. And make sure your RATT can " "work properly (check $ANNOgesic/output/" "annotation_transfer/ratt_log.txt).") log.write("Please check your fasta or " "annotation files, they should only contain " "the query genome. And make sure your RATT can " "work properly (check $ANNOgesic/output/" "annotation_transfer/ratt_log.txt).\n") self._remove_files(args_ratt, out_gbk, log)