示例#1
0
 def __init__(self, args_ratt):
     self.multiparser = Multiparser()
     self.converter = Converter()
     self.format_fixer = FormatFixer()
     self.helper = Helper()
     self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
     self.gbk_tmp = os.path.join(self.gbk, "tmp")
     self.embl = os.path.join(args_ratt.ref_embls, "embls")
     self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
     self.tmp_files = {
         "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
         "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
         "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
         "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
         "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
         "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")
     }
示例#2
0
 def __init__(self, args_tar):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.fixer = FormatFixer()
     self.gff_parser = Gff3Parser()
     self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs")
     self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
     self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex_results")
     self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results")
     self.intarna_path = os.path.join(args_tar.out_folder, "IntaRNA_results")
     self.merge_path = os.path.join(args_tar.out_folder, "merged_results")
     self.srna_path = os.path.join(args_tar.srnas, "tmp")
     self.fasta_path = os.path.join(args_tar.fastas, "tmp")
     self.gff_path = os.path.join(args_tar.gffs, "tmp")
     self.tmps = {"tmp": "tmp_srna_target", "rnaup": "tmp_rnaup",
                  "log": "tmp_log",
                  "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}
示例#3
0
 def setUp(self):
     self.fixer = FormatFixer()
     self.example = Example()
     self.ratt_out = self.example.ratt_out
     self.rnaplex_out = self.example.rnaplex_out
     self.emboss_out = self.example.emboss_out
     self.test_folder = "test_folder"
     if (not os.path.exists(self.test_folder)):
         os.mkdir(self.test_folder)
     self.ratt_file = os.path.join(self.test_folder, "ratt.gff")
     with open(self.ratt_file, "w") as rh:
         rh.write(self.example.ratt_gff)
     self.rnaplex_file = os.path.join(self.test_folder, "rnaplex.txt")
     with open(self.rnaplex_file, "w") as rh:
         rh.write(self.example.rnaplex_file)
     self.emboss_file = os.path.join(self.test_folder, "emboss.txt")
     with open(self.emboss_file, "w") as rh:
         rh.write(self.example.emboss_file)
示例#4
0
 def __init__(self, args_sub):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.fixer = FormatFixer()
     self.gff_path = os.path.join(args_sub.gffs, "tmp")
     self.fasta_path = os.path.join(args_sub.fastas, "tmp")
     if args_sub.trans is not None:
         self.tran_path = os.path.join(args_sub.trans, "tmp")
     else:
         self.tran_path = None
     self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
     self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
     self.all_tmp_path = os.path.join(self.out_all, "tmp")
     self.express_tmp_path = os.path.join(self.out_express, "tmp")
     self.all_stat_path = os.path.join(self.out_all, "statistics")
     self.express_stat_path = os.path.join(self.out_express, "statistics")
     self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
     self.express_tmp_result = os.path.join(self.out_express, "tmp_results")
     self.all_result = os.path.join(self.out_all, "psortb_results")
     self.express_result = os.path.join(self.out_express, "psortb_results")
     self.endfix_table = "table.csv"
     self.endfix_raw = "raw.txt"
     self._make_folder()
示例#5
0
 def __init__(self, args_tar):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.fixer = FormatFixer()
     self.gff_parser = Gff3Parser()
     self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs")
     self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
     self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex")
     self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup")
     self.merge_path = os.path.join(args_tar.out_folder, "merge")
     self.srna_path = os.path.join(args_tar.srnas, "tmp")
     self.fasta_path = os.path.join(args_tar.fastas, "tmp")
     self.gff_path = os.path.join(args_tar.gffs, "tmp")
     self.tmps = {"tmp": "tmp", "rnaup": "tmp_rnaup", "log": "tmp_log",
                  "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}
示例#6
0
 def setUp(self):
     self.fixer = FormatFixer()
     self.example = Example()
     self.ratt_out = self.example.ratt_out
     self.rnaplex_out = self.example.rnaplex_out
     self.emboss_out = self.example.emboss_out
     self.test_folder = "test_folder"
     if (not os.path.exists(self.test_folder)):
         os.mkdir(self.test_folder)
     self.ratt_file = os.path.join(self.test_folder, "ratt.gff")
     with open(self.ratt_file, "w") as rh:
         rh.write(self.example.ratt_gff)
     self.rnaplex_file = os.path.join(self.test_folder, "rnaplex.txt")
     with open(self.rnaplex_file, "w") as rh:
         rh.write(self.example.rnaplex_file)
     self.emboss_file = os.path.join(self.test_folder, "emboss.txt")
     with open(self.emboss_file, "w") as rh:
         rh.write(self.example.emboss_file)
示例#7
0
 def __init__(self, args_ratt):
     self.multiparser = Multiparser()
     self.converter = Converter()
     self.format_fixer = FormatFixer()
     self.helper = Helper()
     self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
     self.gbk_tmp = os.path.join(self.gbk, "tmp")
     self.embl = os.path.join(args_ratt.ref_embls, "embls")
     self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
     self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                       "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                       "out_gff": os.path.join(args_ratt.gff_outfolder,
                                               "tmp"),
                       "gff": os.path.join(args_ratt.gff_outfolder,
                                           "tmp.gff"),
                       "ptt": os.path.join(args_ratt.gff_outfolder,
                                           "tmp.ptt"),
                       "rnt": os.path.join(args_ratt.gff_outfolder,
                                           "tmp.rnt")}
示例#8
0
 def __init__(self, args_sub):
     self.multiparser = Multiparser()
     self.helper = Helper()
     self.fixer = FormatFixer()
     self.gff_path = os.path.join(args_sub.gffs, "tmp")
     self.fasta_path = os.path.join(args_sub.fastas, "tmp")
     if args_sub.trans is not None:
         self.tran_path = os.path.join(args_sub.trans, "tmp")
     else:
         self.tran_path = None
     self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
     self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
     self.all_tmp_path = os.path.join(self.out_all, "tmp")
     self.express_tmp_path = os.path.join(self.out_express, "tmp")
     self.all_stat_path = os.path.join(self.out_all, "statistics")
     self.express_stat_path = os.path.join(self.out_express, "statistics")
     self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
     self.express_tmp_result = os.path.join(self.out_express, "tmp_results")
     self.all_result = os.path.join(self.out_all, "psortb_results")
     self.express_result = os.path.join(self.out_express, "psortb_results")
     self.endfix_table = "table.csv"
     self.endfix_raw = "raw.txt"
     self._make_folder()
示例#9
0
class TestFormatFixer(unittest.TestCase):

    def setUp(self):
        self.fixer = FormatFixer()
        self.example = Example()
        self.ratt_out = self.example.ratt_out
        self.rnaplex_out = self.example.rnaplex_out
        self.emboss_out = self.example.emboss_out
        self.test_folder = "test_folder"
        if (not os.path.exists(self.test_folder)):
            os.mkdir(self.test_folder)
        self.ratt_file = os.path.join(self.test_folder, "ratt.gff")
        with open(self.ratt_file, "w") as rh:
            rh.write(self.example.ratt_gff)
        self.rnaplex_file = os.path.join(self.test_folder, "rnaplex.txt")
        with open(self.rnaplex_file, "w") as rh:
            rh.write(self.example.rnaplex_file)
        self.emboss_file = os.path.join(self.test_folder, "emboss.txt")
        with open(self.emboss_file, "w") as rh:
            rh.write(self.example.emboss_file)

    def tearDown(self):
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    def test_fix_ratt(self):
        out = os.path.join(self.test_folder, "ratt.out")
        self.fixer.fix_ratt(self.ratt_file, "Staphylococcus_aureus_HG003", out)
        datas = import_data(out)
        self.assertEqual(set(datas), set(self.ratt_out.split("\n")))

    def test_fix_rnaplex(self):
        out_file = os.path.join(self.test_folder, "rnaplex.out")
        self.fixer.fix_rnaplex(self.rnaplex_file, out_file)
        datas = import_data(out_file)
        self.assertEqual(set(datas), set(self.rnaplex_out.split("\n")))

    def test_fix_emboss(self):
        out_file = os.path.join(self.test_folder, "emboss.out")
        self.fixer.fix_emboss(self.emboss_file, out_file)
        datas = import_data(out_file)
        self.assertEqual(set(datas), set(self.emboss_out.split("\n")))
示例#10
0
class TestFormatFixer(unittest.TestCase):
    def setUp(self):
        self.fixer = FormatFixer()
        self.example = Example()
        self.ratt_out = self.example.ratt_out
        self.rnaplex_out = self.example.rnaplex_out
        self.emboss_out = self.example.emboss_out
        self.test_folder = "test_folder"
        if (not os.path.exists(self.test_folder)):
            os.mkdir(self.test_folder)
        self.ratt_file = os.path.join(self.test_folder, "ratt.gff")
        with open(self.ratt_file, "w") as rh:
            rh.write(self.example.ratt_gff)
        self.rnaplex_file = os.path.join(self.test_folder, "rnaplex.txt")
        with open(self.rnaplex_file, "w") as rh:
            rh.write(self.example.rnaplex_file)
        self.emboss_file = os.path.join(self.test_folder, "emboss.txt")
        with open(self.emboss_file, "w") as rh:
            rh.write(self.example.emboss_file)

    def tearDown(self):
        if os.path.exists(self.test_folder):
            shutil.rmtree(self.test_folder)

    def test_fix_ratt(self):
        out = os.path.join(self.test_folder, "ratt.out")
        self.fixer.fix_ratt(self.ratt_file, "Staphylococcus_aureus_HG003", out)
        datas = import_data(out)
        self.assertEqual(set(datas), set(self.ratt_out.split("\n")))

    def test_fix_rnaplex(self):
        out_file = os.path.join(self.test_folder, "rnaplex.out")
        self.fixer.fix_rnaplex(self.rnaplex_file, out_file)
        datas = import_data(out_file)
        self.assertEqual(set(datas), set(self.rnaplex_out.split("\n")))

    def test_fix_emboss(self):
        out_file = os.path.join(self.test_folder, "emboss.out")
        self.fixer.fix_emboss(self.emboss_file, out_file)
        datas = import_data(out_file)
        self.assertEqual(set(datas), set(self.emboss_out.split("\n")))
示例#11
0
class RATT(object):

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        self.gbk = os.path.join(args_ratt.ref_embls, "gbk_tmp")
        self.gbk_tmp = os.path.join(self.gbk, "tmp")
        self.embl = os.path.join(args_ratt.ref_embls, "embls")
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files):
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)

    def _remove_files(self, args_ratt, out_gbk):
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        shutil.rmtree(self.embl)
        self.helper.remove_all_content(args_ratt.tar_fastas, "_folder", "dir")
        self.helper.remove_all_content(args_ratt.ref_fastas, "_folder", "dir")
        if out_gbk:
            shutil.rmtree(out_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
             os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data, "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data, "gbk"])
                                break
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if embl.endswith(".gbk"):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            print("Error: please assign proper folder for Genebank file!!!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        print("Running RATT...")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            print(tar)
            self._run_ratt(args_ratt, tar, ref, out)
            for filename in os.listdir():
                if ("final" in filename):
                    shutil.move(filename, os.path.join(args_ratt.output_path,
                                                       filename))
                elif (args_ratt.element in filename) or (
                      "query" in filename) or (
                      "Reference" in filename) or (
                      "Query" in filename) or (
                      "Sequences" in filename):
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
        out.close()

    def annotation_transfer(self, args_ratt):
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = self._convert_embl(args_ratt.ref_embls)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" in folder:
                    datas = folder.split("_folder")
                    prefix = datas[0][:-3]
                    for file_ in os.listdir(os.path.join(args_ratt.tar_fastas,
                                                         folder)):
                        files.append(file_[:-3])
                    for gff in os.listdir(args_ratt.gff_outfolder):
                        for file_ in files:
                            if (".gff" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                     args_ratt.gff_outfolder, gff),
                                     self.tmp_files["gff"])
                            if (".ptt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                     args_ratt.gff_outfolder, gff),
                                     self.tmp_files["ptt"])
                            if (".rnt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(os.path.join(
                                     args_ratt.gff_outfolder, gff),
                                     self.tmp_files["rnt"])
                    shutil.move(self.tmp_files["gff"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".rnt"))
        self._remove_files(args_ratt, out_gbk)
示例#12
0
class sRNATargetPrediction(object):
    '''detection of sRNA-target interaction'''

    def __init__(self, args_tar):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex_results")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results")
        self.intarna_path = os.path.join(args_tar.out_folder, "IntaRNA_results")
        self.merge_path = os.path.join(args_tar.out_folder, "merged_results")
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        self.tmps = {"tmp": "tmp_srna_target", "rnaup": "tmp_rnaup",
                     "log": "tmp_log",
                     "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span,
                       unstr_region, seq_path, prefix, out_path, log):
        current = os.getcwd()
        os.chdir(out_path)
        command = " ".join([rnaplfold_path,
                            "-W", str(win_size),
                            "-L", str(span),
                            "-u", str(unstr_region),
                            "-O"])
        if file_type == "sRNA":
            log.write("<".join([command, os.path.join(current, seq_path,
                                "_".join([self.tmps["tmp"], prefix,
                                          file_type + ".fa"]))]) + "\n")
            os.system("<".join([command, os.path.join(current, seq_path,
                                "_".join([self.tmps["tmp"], prefix,
                                          file_type + ".fa"]))]))
        else:
            log.write("<".join([command, os.path.join(current, seq_path,
                                "_".join([prefix, file_type + ".fa"]))]) + "\n")
            os.system("<".join([command, os.path.join(current, seq_path,
                                "_".join([prefix, file_type + ".fa"]))]))
        os.chdir(current)

    def _wait_process(self, processes):
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        out = open(os.path.join(path,
                   "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w")
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line, "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        for srna in srnas:
            out.write(">" + srna["name"].split("|")[0] + "\n")
            out.write(srna["seq"] + "\n")
        out.close()

    def _read_fasta(self, fasta_file):
        seq = ""
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if line.startswith(">"):
                    continue
                else:
                    seq = seq + line
        return seq

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        for query in querys:
            srna_datas = query.split(":")
            srna = {"seq_id": srna_datas[0], "strand": srna_datas[3],
                    "start": int(srna_datas[1]), "end": int(srna_datas[2])}
            gff_f = open(srna_file, "r")
            out = open(srna_out, "a")
            seq = self._read_fasta(seq_file)
            num = 0
            detect = False
            for entry in self.gff_parser.entries(gff_f):
                if (entry.seq_id == srna["seq_id"]) and (
                        entry.strand == srna["strand"]) and (
                        entry.start == srna["start"]) and (
                        entry.end == srna["end"]):
                    detect = True
                    if "ID" in entry.attributes.keys():
                        id_ = entry.attributes["ID"]
                    else:
                        id_ = entry.feature + str(num)
                    gene = self.helper.extract_gene(seq, entry.start,
                                                    entry.end, entry.strand)
                    out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                              id_, entry.seq_id, entry.start,
                              entry.end, entry.strand, gene))
                    num += 1
            if not detect:
                print("Error: Some of the query sRNAs do not exist!")
                sys.exit()
            gff_f.close()
            out.close()

    def _gen_seq(self, prefixs, args_tar):
        print("Generating sRNA fasta files")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    self.helper.get_seq(
                            os.path.join(self.srna_path, srna),
                            os.path.join(self.fasta_path, prefix + ".fa"),
                            srna_out)
                else:
                    if "_".join([prefix, "sRNA.fa"]) in os.listdir(
                       self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                            os.path.join(self.srna_path, srna),
                            os.path.join(self.fasta_path, prefix + ".fa"),
                            srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)
        print("Generating target fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path, prefix + ".fa"),
                                 os.path.join(self.target_seq_path), args_tar)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]),
                               "w")
                with open((sub_prefix + ".fa"), "r") as t_f:
                    for line in t_f:
                        line = line.strip()
                        if line.startswith(">"):
#                            line = line.replace("|", "_")
                            num += 1
                        if (num == 100):
                            num = 0
                            file_num += 1
                            sub_out.close()
                            sub_out = open("_".join([sub_prefix,
                                           str(file_num) + ".fa"]), "w")
                        sub_out.write(line + "\n")
                sub_out.close()

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar, log):
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        for seq in os.listdir(self.target_seq_path):
            if (prefix in seq) and ("_target_" in seq):
                print("Running RNAplex with {0}".format(seq))
                out_rnaplex = open(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(num_process) + ".txt"])), "w")
                num_process += 1
                log.write(" ".join([args_tar.rnaplex_path,
                           "-q", os.path.join(
                               self.srna_seq_path, "_".join([
                                   self.tmps["tmp"], prefix, "sRNA.fa"])),
                           "-t", os.path.join(self.target_seq_path, seq),
                           "-l", str(args_tar.inter_length),
                           "-e", str(args_tar.energy),
                           "-z", str(args_tar.duplex_dist),
                           "-a", rnaplfold_folder]) + "\n")
                p = Popen([args_tar.rnaplex_path,
                           "-q", os.path.join(
                               self.srna_seq_path, "_".join([
                                   self.tmps["tmp"], prefix, "sRNA.fa"])),
                           "-t", os.path.join(self.target_seq_path, seq),
                           "-l", str(args_tar.inter_length),
                           "-e", str(args_tar.energy),
                           "-z", str(args_tar.duplex_dist),
                           "-a", rnaplfold_folder], stdout=out_rnaplex)
                processes.append(p)
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        log.write("The prediction for {0} is done.\n".format(prefix))
        log.write("The following temporary files for storing results of {0} are "
                  "generated:\n".format(prefix))
        for file_ in os.listdir(os.path.join(self.rnaplex_path, prefix)):
            log.write("\t" + os.path.join(self.rnaplex_path, prefix, file_) + "\n")
        return num_process

    def _rna_plex(self, prefixs, args_tar, log):
        log.write("Using RNAplex and RNAplfold to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                        os.path.join(self.rnaplex_path, prefix))
            rnaplfold_folder = os.path.join(self.rnaplex_path, prefix,
                                          "RNAplfold")
            os.mkdir(rnaplfold_folder)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "sRNA", args_tar.win_size_s,
                args_tar.span_s, args_tar.unstr_region_rnaplex_s,
                self.srna_seq_path, prefix, rnaplfold_folder, log)
            self._run_rnaplfold(
                args_tar.rnaplfold_path, "target", args_tar.win_size_t,
                args_tar.span_t, args_tar.unstr_region_rnaplex_t,
                self.target_seq_path, prefix, rnaplfold_folder, log)
            num_process = self._run_rnaplex(prefix, rnaplfold_folder, args_tar, log)
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            if ("_".join([prefix, "RNAplex.txt"]) in
                    os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                log.write("Using helper.py to merge the temporary files.\n")
                self.helper.merge_file(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(index) + ".txt"])),
                    rnaplex_file)
            log.write("\t" + rnaplex_file + " is generated.\n")
            self.helper.remove_all_content(os.path.join(
                 self.rnaplex_path, prefix), "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, prefix, out_rnaup, out_log,
                   args_tar, log):
        for index in range(1, num_up + 1):
            out_tmp_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["rnaup"],
                                              str(index), ".txt"])), "w")
            out_err = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["log"],
                                              str(index), ".txt"])), "w")
            in_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["tmp"],
                                              str(index), ".fa"])), "r")
            log.write(" ".join([args_tar.rnaup_path,
                       "-u", str(args_tar.unstr_region_rnaup),
                       "-o", "--interaction_first"]) + "\n")
            p = Popen([args_tar.rnaup_path,
                       "-u", str(args_tar.unstr_region_rnaup),
                       "-o", "--interaction_first"],
                      stdin=in_up, stdout=out_tmp_up, stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            log.write("The following temporary files for storing results of {0} are "
                      "generated:\n".format(prefix))
            for file_ in os.listdir(os.path.join(args_tar.out_folder)):
                log.write("\t" + os.path.join(args_tar.out_folder, file_) + "\n")
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["rnaup"],
                                                  str(index), ".txt"])),
                out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["log"],
                                                  str(index), ".txt"])),
                out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, it can continue running RNAup based on previous run'''
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, args_tar, log):
        log.write("Using RNAup to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    log.write("The data from the previous run is found.\n")
                    srnas = self._get_continue(out_rnaup)
                    log.write("The previous data is loaded.\n")
            with open(os.path.join(self.srna_seq_path, "_".join([
                    self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(os.path.join(args_tar.out_folder,
                                      "".join([self.tmps["tmp"],
                                               str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            self.helper.merge_file(os.path.join(
                                self.target_seq_path,
                                "_".join([prefix, "target.fa"])),
                                os.path.join(args_tar.out_folder,
                                             "".join([self.tmps["tmp"],
                                                      str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes, prefix,
                                                out_rnaup, out_log, args_tar, log)
                                processes = []
                                num_up = 0
            self._run_rnaup(num_up, processes, prefix, out_rnaup, out_log,
                            args_tar, log)
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + out_rnaup + " is complete generated and updated.\n")

    def _intarna(self, prefixs, args_tar, log):
        log.write("Using IntaRNA to predict sRNA targets.\n")
        log.write("Please make sure the version of IntaRNA is at least 2.0.4.\n")
        for prefix in prefixs:
            print("Running IntaRNA of {0}".format(prefix))
            intarna_file = os.path.join(self.intarna_path, prefix,
                                        prefix + "_IntaRNA.txt")
            self.helper.check_make_folder(
                        os.path.join(self.intarna_path, prefix))
            call([args_tar.intarna_path,
                  "-q", os.path.join(
                      self.srna_seq_path, "_".join([
                          self.tmps["tmp"], prefix, "sRNA.fa"])),
                  "-t", os.path.join(self.target_seq_path,
                                     prefix + "_target.fa"),
                  "--qAccW", str(args_tar.slide_win_srna),
                  "--qAccL", str(args_tar.max_loop_srna),
                  "--tAccW", str(args_tar.slide_win_target),
                  "--tAccL", str(args_tar.max_loop_target),
                  "--outMode", "C", "-m", args_tar.mode_intarna,
                  "--threads", str(args_tar.core_inta),
                  "--out", intarna_file])
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + intarna_file + " is generated.\n")

    def _merge_rnaplex_rnaup(self, prefixs, args_tar, log):
        '''merge the result of IntaRNA, RNAup and RNAplex'''
        log.write("Running merge_rnaplex_rnaup.py to merge the results from "
                  "RNAplex, RNAup, and IntaRNA for generating finanl output.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            intarna_file = None
            out_intarna = None
            self.helper.check_make_folder(os.path.join(
                                          self.merge_path, prefix))
            print("Ranking {0} now".format(prefix))
            if ("RNAplex" in args_tar.program):
                rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                            "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                        self.rnaplex_path, prefix,
                        "_".join([prefix, "RNAplex_rank.csv"]))
                self._remove_repeat(rnaplex_file, "RNAplex")
            if ("RNAup" in args_tar.program):
                rnaup_file = os.path.join(self.rnaup_path, prefix,
                                          "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(self.rnaup_path, prefix,
                                         "_".join([prefix, "RNAup_rank.csv"]))
                self._remove_repeat(rnaup_file, "RNAup")
            if ("IntaRNA" in args_tar.program):
                intarna_file = os.path.join(self.intarna_path, prefix,
                                            "_".join([prefix, "IntaRNA.txt"]))
                out_intarna = os.path.join(self.intarna_path, prefix,
                                           "_".join([prefix, "IntaRNA_rank.csv"]))
                self._remove_repeat(intarna_file, "IntaRNA")
            overlap_file = os.path.join(self.merge_path, prefix,
                                        "_".join([prefix, "overlap.csv"]))
            merge_file = os.path.join(self.merge_path, prefix,
                                      "_".join([prefix, "merge.csv"]))
            merge_srna_target(rnaplex_file, rnaup_file, intarna_file, args_tar,
                              out_rnaplex, out_rnaup, out_intarna,
                              os.path.join(self.fasta_path, prefix + ".fa"),
                              merge_file, overlap_file,
                              os.path.join(self.srna_path,
                                           "_".join([prefix, "sRNA.gff"])),
                              os.path.join(self.gff_path, prefix + ".gff"))
            if ("RNAplex" in args_tar.program):
                log.write("\t" + out_rnaplex + "\n")
            if ("RNAup" in args_tar.program):
                log.write("\t" + out_rnaup + "\n")
            if ("IntaRNA" in args_tar.program):
                log.write("\t" + out_intarna + "\n")
            if (os.path.exists(merge_file)):
                log.write("\t" + merge_file + "\n")
            if (os.path.exists(overlap_file)):
                log.write("\t" + overlap_file + "\n")

    def _remove_rnaplex(self, line, num, pre_num, pre, checks,
                        out_tmp, print_):
        if (line.startswith(">")):
            if (num % 2 == 1):
                print_ = False
                pre = line
                if (line not in checks):
                    checks[line] = []
                    print_ = True
            elif (num % 2 == 0) and (line not in checks[pre]):
                checks[pre].append(line)
                print_ = True
            num = num + 1
        else:
            if (print_):
                if (num != pre_num):
                    out_tmp.write(pre + "\n")
                    out_tmp.write(checks[pre][-1] + "\n")
                out_tmp.write(line + "\n")
                pre_num = num
        return num, pre_num, print_, pre,

    def _remove_rnaup(self, line, pre, num, pre_num, srna_info,
                      checks, out_tmp, print_, tar):
        if (line.startswith(">")):
            print_ = False
            tar = False
            if (pre.startswith(">")):
                if (pre not in checks):
                    checks[pre] = [line]
                    srna_info = pre
                    print_ = True
                else:
                    if (line not in checks[pre]):
                        checks[pre].append(line)
                        print_ = True
            else:
                if (num != 1):
                    if (line not in checks[srna_info]):
                        checks[srna_info].append(line)
                        print_ = True
        else:
            if (print_):
                if (pre_num != len(checks)):
                    out_tmp.write(srna_info + "\n")
                    out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                else:
                    if (not tar):
                        out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                pre_num = len(checks)
                tar = True
        pre = line
        num = num + 1
        return num, pre_num, print_, pre, tar, srna_info

    def _remove_intarna(self, line, checks, tar, srna_info, seq, out_tmp):
        if (line.startswith(".")) or (
                line.startswith("(")) or (
                line.startswith(")")):
            seq = line.split(";")[0]
            if (seq not in checks[tar][srna_info]):
                checks[tar][srna_info].append(seq)
                out_tmp.write(line + "\n")
        else:
            if (len(line.split(";")) >= 8):
                tar = line.split(";")[0]
                srna_info = line.split(";")[3]
                seq = line.split(";")[7]
                if (tar not in checks):
                    checks[tar] = {}
                    checks[tar][srna_info] = [seq]
                    out_tmp.write(line + "\n")
                else:
                    if (srna_info not in checks[tar]):
                        checks[tar][srna_info] = [seq]
                        out_tmp.write(line + "\n")
        return tar, srna_info, seq

    def _remove_repeat(self, interact_file, type_):
        checks = {}
        seq = ""
        pre = ""
        srna_info = ""
        num = 1
        tar = False
        pre_num = 0
        print_ = False
        out_tmp = open(interact_file + "tmp", "w")
        with open(interact_file) as fh:
            for line in fh:
                line = line.strip()
                if (type_ == "RNAplex"):
                    num, pre_num, print_, pre = self._remove_rnaplex(
                            line, num, pre_num, pre, checks, out_tmp, print_)
                elif (type_ == "RNAup"):
                    num, pre_num, print_, pre, tar, srna_info = (
                            self._remove_rnaup(
                                line, pre, num, pre_num,
                                srna_info, checks, out_tmp, print_, tar))
                elif (type_ == "IntaRNA"):
                    tar, srna_info, seq = self._remove_intarna(
                            line, checks, tar, srna_info, seq, out_tmp)
        out_tmp.close()
        shutil.move(interact_file + "tmp", interact_file)


    def run_srna_target_prediction(self, args_tar, log):
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        self._gen_seq(prefixs, args_tar)
        if ("RNAplex" in args_tar.program):
            self._rna_plex(prefixs, args_tar, log)
        self.helper.remove_all_content(self.target_seq_path,
                                       "_target_", "file")
        log.write("The temporary files for running RNAplex are deleted.\n")
        if ("RNAup" in args_tar.program):
            self._rnaup(prefixs, args_tar, log)
        if ("IntaRNA" in args_tar.program):
            self._intarna(prefixs, args_tar, log)
        self._merge_rnaplex_rnaup(prefixs, args_tar, log)
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "dir")
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
示例#13
0
class sRNATargetPrediction(object):
    '''detection of sRNA-target interaction'''
    def __init__(self, args_tar):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder,
                                         "RNAplex_results")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results")
        self.merge_path = os.path.join(args_tar.out_folder, "merged_results")
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        self.tmps = {
            "tmp": "tmp_srna_target",
            "rnaup": "tmp_rnaup",
            "log": "tmp_log",
            "all_fa": "tmp*.fa",
            "all_txt": "tmp*.txt"
        }

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span,
                       unstr_region, seq_path, prefix, out_path):
        current = os.getcwd()
        os.chdir(out_path)
        command = " ".join([
            rnaplfold_path, "-W",
            str(win_size), "-L",
            str(span), "-u",
            str(unstr_region), "-O"
        ])
        if file_type == "sRNA":
            os.system("<".join([
                command,
                os.path.join(
                    current, seq_path,
                    "_".join([self.tmps["tmp"], prefix, file_type + ".fa"]))
            ]))
        else:
            os.system("<".join([
                command,
                os.path.join(current, seq_path,
                             "_".join([prefix, file_type + ".fa"]))
            ]))
        os.chdir(current)

    def _wait_process(self, processes):
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        out = open(
            os.path.join(path, "_".join([self.tmps["tmp"], prefix,
                                         "sRNA.fa"])), "w")
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line, "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        for srna in srnas:
            out.write(">" + srna["name"].split("|")[0] + "\n")
            out.write(srna["seq"] + "\n")
        out.close()

    def _read_fasta(self, fasta_file):
        seq = ""
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if line.startswith(">"):
                    continue
                else:
                    seq = seq + line
        return seq

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        for query in querys:
            srna_datas = query.split(":")
            srna = {
                "seq_id": srna_datas[0],
                "strand": srna_datas[3],
                "start": int(srna_datas[1]),
                "end": int(srna_datas[2])
            }
            gff_f = open(srna_file, "r")
            out = open(srna_out, "a")
            seq = self._read_fasta(seq_file)
            num = 0
            detect = False
            for entry in self.gff_parser.entries(gff_f):
                if (entry.seq_id == srna["seq_id"]) and (
                        entry.strand == srna["strand"]) and (
                            entry.start == srna["start"]) and (entry.end
                                                               == srna["end"]):
                    detect = True
                    if "ID" in entry.attributes.keys():
                        id_ = entry.attributes["ID"]
                    else:
                        id_ = entry.feature + str(num)
                    gene = self.helper.extract_gene(seq, entry.start,
                                                    entry.end, entry.strand)
                    out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                        id_, entry.seq_id, entry.start, entry.end,
                        entry.strand, gene))
                    num += 1
            if not detect:
                print("Error: Some of the query sRNAs do not exist!")
                sys.exit()
            gff_f.close()
            out.close()

    def _gen_seq(self, prefixs, args_tar):
        print("Generating sRNA fasta files")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    self.helper.get_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out)
                else:
                    if "_".join([prefix,
                                 "sRNA.fa"]) in os.listdir(self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)
        print("Generating target fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path, prefix + ".fa"),
                                 os.path.join(self.target_seq_path), args_tar)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                sub_out = open("_".join([sub_prefix,
                                         str(file_num) + ".fa"]), "w")
                with open((sub_prefix + ".fa"), "r") as t_f:
                    for line in t_f:
                        line = line.strip()
                        if line.startswith(">"):
                            num += 1
                        if (num == 100):
                            num = 0
                            file_num += 1
                            sub_out.close()
                            sub_out = open(
                                "_".join([sub_prefix,
                                          str(file_num) + ".fa"]), "w")
                        sub_out.write(line + "\n")
                sub_out.close()

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar):
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        for seq in os.listdir(self.target_seq_path):
            if (prefix in seq) and ("_target_" in seq):
                print("Running RNAplex with {0}".format(seq))
                out_rnaplex = open(
                    os.path.join(
                        self.rnaplex_path, prefix, "_".join(
                            [prefix, "RNAplex",
                             str(num_process) + ".txt"])), "w")
                num_process += 1
                p = Popen([
                    args_tar.rnaplex_path, "-q",
                    os.path.join(
                        self.srna_seq_path, "_".join(
                            [self.tmps["tmp"], prefix, "sRNA.fa"])), "-t",
                    os.path.join(self.target_seq_path, seq), "-l",
                    str(args_tar.inter_length), "-e",
                    str(args_tar.energy), "-z",
                    str(args_tar.duplex_dist), "-a", rnaplfold_folder
                ],
                          stdout=out_rnaplex)
                processes.append(p)
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        return num_process

    def _rna_plex(self, prefixs, args_tar):
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                os.path.join(self.rnaplex_path, prefix))
            rnaplfold_folder = os.path.join(self.rnaplex_path, prefix,
                                            "RNAplfold")
            os.mkdir(rnaplfold_folder)
            self._run_rnaplfold(args_tar.rnaplfold_path, "sRNA",
                                args_tar.win_size_s, args_tar.span_s,
                                args_tar.unstr_region_rnaplex_s,
                                self.srna_seq_path, prefix, rnaplfold_folder)
            self._run_rnaplfold(args_tar.rnaplfold_path, "target",
                                args_tar.win_size_t, args_tar.span_t,
                                args_tar.unstr_region_rnaplex_t,
                                self.target_seq_path, prefix, rnaplfold_folder)
            num_process = self._run_rnaplex(prefix, rnaplfold_folder, args_tar)
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            if ("_".join([prefix, "RNAplex.txt"])
                    in os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                self.helper.merge_file(
                    os.path.join(
                        self.rnaplex_path, prefix,
                        "_".join([prefix, "RNAplex",
                                  str(index) + ".txt"])), rnaplex_file)
            self.helper.remove_all_content(
                os.path.join(self.rnaplex_path, prefix), "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, out_rnaup, out_log, args_tar):
        for index in range(1, num_up + 1):
            out_tmp_up = open(
                os.path.join(args_tar.out_folder,
                             "".join([self.tmps["rnaup"],
                                      str(index), ".txt"])), "w")
            out_err = open(
                os.path.join(args_tar.out_folder,
                             "".join([self.tmps["log"],
                                      str(index), ".txt"])), "w")
            in_up = open(
                os.path.join(args_tar.out_folder,
                             "".join([self.tmps["tmp"],
                                      str(index), ".fa"])), "r")
            p = Popen([
                args_tar.rnaup_path, "-u",
                str(args_tar.unstr_region_rnaup), "-o", "--interaction_first"
            ],
                      stdin=in_up,
                      stdout=out_tmp_up,
                      stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            os.system("rm " +
                      os.path.join(args_tar.out_folder, self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            os.system("rm " +
                      os.path.join(args_tar.out_folder, self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder,
                             "".join([self.tmps["rnaup"],
                                      str(index), ".txt"])), out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder,
                             "".join([self.tmps["log"],
                                      str(index), ".txt"])), out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, it can continue running RNAup based on previous run'''
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, args_tar):
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    srnas = self._get_continue(out_rnaup)
            with open(
                    os.path.join(
                        self.srna_seq_path,
                        "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])),
                    "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(
                            os.path.join(
                                args_tar.out_folder,
                                "".join([self.tmps["tmp"],
                                         str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            self.helper.merge_file(
                                os.path.join(self.target_seq_path,
                                             "_".join([prefix, "target.fa"])),
                                os.path.join(
                                    args_tar.out_folder, "".join(
                                        [self.tmps["tmp"],
                                         str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes, out_rnaup,
                                                out_log, args_tar)
                                processes = []
                                num_up = 0
            self._run_rnaup(num_up, processes, out_rnaup, out_log, args_tar)

    def _merge_rnaplex_rnaup(self, prefixs, args_tar):
        '''merge the result of RNAup and RNAplex'''
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            self.helper.check_make_folder(os.path.join(self.merge_path,
                                                       prefix))
            print("Ranking {0} now".format(prefix))
            if (args_tar.program == "both") or (args_tar.program == "RNAplex"):
                rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                            "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex_rank.csv"]))
            if (args_tar.program == "both") or (args_tar.program == "RNAup"):
                rnaup_file = os.path.join(self.rnaup_path, prefix,
                                          "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(self.rnaup_path, prefix,
                                         "_".join([prefix, "RNAup_rank.csv"]))
            merge_srna_target(
                rnaplex_file, rnaup_file, args_tar, out_rnaplex, out_rnaup,
                os.path.join(self.fasta_path, prefix + ".fa"),
                os.path.join(self.merge_path, prefix,
                             "_".join([prefix, "merge.csv"])),
                os.path.join(self.merge_path, prefix,
                             "_".join([prefix, "overlap.csv"])),
                os.path.join(self.srna_path, "_".join([prefix, "sRNA.gff"])),
                os.path.join(self.gff_path, prefix + ".gff"))

    def run_srna_target_prediction(self, args_tar):
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        self._gen_seq(prefixs, args_tar)
        if (args_tar.program == "both") or (args_tar.program == "RNAplex"):
            self._rna_plex(prefixs, args_tar)
        self.helper.remove_all_content(self.target_seq_path, "_target_",
                                       "file")
        #        if (args_tar.program == "RNAplex") or (
        #                args_tar.program == "both"):
        #            for strain in os.listdir(os.path.join(
        #                          args_tar.out_folder, "RNAplex_results")):
        #                shutil.rmtree(os.path.join(args_tar.out_folder, "RNAplex_results",
        #                                           strain, "RNAplfold"))
        if (args_tar.program == "both") or (args_tar.program == "RNAup"):
            self._rnaup(prefixs, args_tar)
        self._merge_rnaplex_rnaup(prefixs, args_tar)
        self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"],
                                       "dir")
        self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"],
                                       "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
示例#14
0
class sRNATargetPrediction(object):
    '''detection of sRNA-target interaction'''
    def __init__(self, args_tar):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder,
                                         "RNAplex_results")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup_results")
        self.intarna_path = os.path.join(args_tar.out_folder,
                                         "IntaRNA_results")
        self.merge_path = os.path.join(args_tar.out_folder, "merged_results")
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        self.tmps = {
            "tmp": "tmp_srna_target",
            "rnaup": "tmp_rnaup",
            "log": "tmp_log",
            "all_fa": "tmp*.fa",
            "all_txt": "tmp*.txt"
        }

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_long_id(self, seq_file, long_ids, type_):
        out_file = seq_file + "_tmp.fa"
        out = open(out_file, "w")
        with open(seq_file) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    if len(line) > 40:
                        long_ids[type_].append(line[1:])
                        out.write(">TMP" + type_ + "_" +
                                  str(len(long_ids[type_])) + "\n")
                    else:
                        out.write(line + "\n")
                else:
                    out.write(line + "\n")
        out.close()
        return out_file

    def _run_rnaplfold(self, rnaplfold_path, file_type, win_size, span,
                       unstr_region, long_ids, seq_path, prefix, out_path,
                       log):
        current = os.getcwd()
        os.chdir(out_path)
        command = " ".join([
            rnaplfold_path, "-W",
            str(win_size), "-L",
            str(span), "-u",
            str(unstr_region), "-O"
        ])
        if file_type == "sRNA":
            srna_seq_file = os.path.join(
                current, seq_path,
                "_".join([self.tmps["tmp"], prefix, file_type + ".fa"]))
            out_file = self._check_long_id(srna_seq_file, long_ids, "srna")
            log.write("<".join([command, out_file]) + "\n")
            os.system("<".join([command, out_file]))
        else:
            tar_seq_file = os.path.join(current, seq_path,
                                        "_".join([prefix, file_type + ".fa"]))
            for tar_seq_file in os.listdir(os.path.join(current, seq_path)):
                if (prefix + "_" + file_type + "_") in tar_seq_file:
                    out_file = self._check_long_id(
                        os.path.join(current, seq_path, tar_seq_file),
                        long_ids, "tar")
                    log.write("<".join([command, out_file]) + "\n")
                    os.system("<".join([command, out_file]))
        os.chdir(current)

    def _wait_process(self, processes):
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        out = open(
            os.path.join(path, "_".join([self.tmps["tmp"], prefix,
                                         "sRNA.fa"])), "w")
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line, "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        for srna in srnas:
            out.write(">" + srna["name"].split("|")[0] + "\n")
            out.write(srna["seq"] + "\n")
        out.close()

    def _read_fasta(self, fasta_file):
        seq = ""
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if line.startswith(">"):
                    continue
                else:
                    seq = seq + line
        return seq

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        for query in querys:
            srna_datas = query.split(":")
            srna = {
                "seq_id": srna_datas[0],
                "strand": srna_datas[3],
                "start": int(srna_datas[1]),
                "end": int(srna_datas[2])
            }
            gff_f = open(srna_file, "r")
            out = open(srna_out, "a")
            seq = self._read_fasta(seq_file)
            num = 0
            detect = False
            for entry in self.gff_parser.entries(gff_f):
                if (entry.seq_id == srna["seq_id"]) and (
                        entry.strand == srna["strand"]) and (
                            entry.start == srna["start"]) and (entry.end
                                                               == srna["end"]):
                    detect = True
                    if "ID" in entry.attributes.keys():
                        id_ = entry.attributes["ID"]
                    else:
                        id_ = entry.feature + str(num)
                    gene = self.helper.extract_gene(seq, entry.start,
                                                    entry.end, entry.strand)
                    out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                        id_, entry.seq_id, entry.start, entry.end,
                        entry.strand, gene))
                    num += 1
            if not detect:
                print("Error: Some of the query sRNAs do not exist!")
                sys.exit()
            gff_f.close()
            out.close()

    def _gen_seq(self, prefixs, target_prefixs, args_tar):
        print("Generating sRNA fasta files")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                target_prefixs.append(prefix)
        detect = False
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path, prefix + ".fa"),
                                 os.path.join(self.target_seq_path), args_tar,
                                 target_prefixs)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                if os.path.exists(sub_prefix + ".fa"):
                    sub_out = open(
                        "_".join([sub_prefix,
                                  str(file_num) + ".fa"]), "w")
                    with open((sub_prefix + ".fa"), "r") as t_f:
                        for line in t_f:
                            line = line.strip()
                            if line.startswith(">"):
                                #                                line = line.replace("|", "_")
                                num += 1
                            if (num == 100):
                                num = 0
                                file_num += 1
                                sub_out.close()
                                sub_out = open(
                                    "_".join(
                                        [sub_prefix,
                                         str(file_num) + ".fa"]), "w")
                            detect = True
                            sub_out.write(line + "\n")
                    sub_out.close()
                else:
                    open(sub_prefix + ".fa", "w").close()
        if not detect:
            print("No assigned features can be found. "
                  "Please check your genome annotation. "
                  "And assign correct features to --target_feature.")
            sys.exit()
        print("Generating sRNA fasta files")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    self.helper.get_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out)
                else:
                    if "_".join([prefix,
                                 "sRNA.fa"]) in os.listdir(self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                        os.path.join(self.srna_path, srna),
                        os.path.join(self.fasta_path, prefix + ".fa"),
                        srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)

    def _run_rnaplex(self, prefix, rnaplfold_folder, args_tar, log):
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        for seq in os.listdir(self.target_seq_path):
            if ("_target_" in seq) and (".fa_tmp.fa" in seq):
                print("Running RNAplex with {0}".format(
                    seq.replace(".fa_tmp.fa", "")))
                out_rnaplex = open(
                    os.path.join(
                        self.rnaplex_path, prefix, "_".join(
                            [prefix, "RNAplex",
                             str(num_process) + ".txt"])), "w")
                num_process += 1
                log.write(" ".join([
                    args_tar.rnaplex_path, "-q",
                    os.path.join(
                        self.srna_seq_path, "_".join([
                            self.tmps["tmp"], prefix, "sRNA.fa_tmp.fa"
                        ])), "-t",
                    os.path.join(self.target_seq_path, seq), "-l",
                    str(args_tar.inter_length), "-e",
                    str(args_tar.energy), "-z",
                    str(args_tar.duplex_dist), "-a", rnaplfold_folder
                ]) + "\n")
                p = Popen([
                    args_tar.rnaplex_path, "-q",
                    os.path.join(
                        self.srna_seq_path, "_".join([
                            self.tmps["tmp"], prefix, "sRNA.fa_tmp.fa"
                        ])), "-t",
                    os.path.join(self.target_seq_path, seq), "-l",
                    str(args_tar.inter_length), "-e",
                    str(args_tar.energy), "-z",
                    str(args_tar.duplex_dist), "-a", rnaplfold_folder
                ],
                          stdout=out_rnaplex)
                processes.append(p)
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        log.write("The prediction for {0} is done.\n".format(prefix))
        log.write(
            "The following temporary files for storing results of {0} are "
            "generated:\n".format(prefix))
        for file_ in os.listdir(os.path.join(self.rnaplex_path, prefix)):
            log.write("\t" + os.path.join(self.rnaplex_path, prefix, file_) +
                      "\n")
        return num_process

    def _restore_long_ids(self, rnaplex_file, long_ids):
        out = open(rnaplex_file + "tmp", "w")
        with open(rnaplex_file, "r") as t_f:
            for line in t_f:
                line = line.strip()
                if (line.startswith(">")):
                    if (line.startswith(">TMPtar_")):
                        header = long_ids["tar"][int(line.split("_")[1]) - 1]
                    elif (line.startswith(">TMPsrna_")):
                        header = long_ids["srna"][int(line.split("_")[1]) - 1]
                    else:
                        header = line[1:]
                    out.write(">" + header + "\n")
                else:
                    out.write(line + "\n")
        out.close()
        shutil.move(rnaplex_file + "tmp", rnaplex_file)

    def _rna_plex(self, prefixs, target_prefixs, args_tar, log):
        log.write("Using RNAplex and RNAplfold to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        tmp_rnaplfold_folder = os.path.join(self.rnaplex_path, "tmp_RNAplfold")
        if os.path.exists(tmp_rnaplfold_folder):
            shutil.rmtree(tmp_rnaplfold_folder)
        os.mkdir(tmp_rnaplfold_folder)
        long_ids = {"tar": [], "srna": []}
        for prefix in target_prefixs:
            self._run_rnaplfold(args_tar.rnaplfold_path, "target",
                                args_tar.win_size_t, args_tar.span_t,
                                args_tar.unstr_region_rnaplex_t, long_ids,
                                self.target_seq_path, prefix,
                                tmp_rnaplfold_folder, log)
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                os.path.join(self.rnaplex_path, prefix))
            rnaplfold_folder = os.path.join(self.rnaplex_path, prefix,
                                            "RNAplfold")
            shutil.copytree(tmp_rnaplfold_folder, rnaplfold_folder)
            self._run_rnaplfold(args_tar.rnaplfold_path, "sRNA",
                                args_tar.win_size_s, args_tar.span_s,
                                args_tar.unstr_region_rnaplex_s, long_ids,
                                self.srna_seq_path, prefix, rnaplfold_folder,
                                log)
            num_process = self._run_rnaplex(prefix, rnaplfold_folder, args_tar,
                                            log)
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            if ("_".join([prefix, "RNAplex.txt"])
                    in os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                log.write("Using helper.py to merge the temporary files.\n")
                self.helper.merge_file(
                    os.path.join(
                        self.rnaplex_path, prefix,
                        "_".join([prefix, "RNAplex",
                                  str(index) + ".txt"])), rnaplex_file)
            if (len(long_ids["tar"]) != 0) or (len(long_ids["srna"]) != 0):
                self._restore_long_ids(rnaplex_file, long_ids)
            log.write("\t" + rnaplex_file + " is generated.\n")
            self.helper.remove_all_content(
                os.path.join(self.rnaplex_path, prefix), "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)
            shutil.rmtree(rnaplfold_folder)

    def _run_rnaup(self, num_up, processes, prefix, out_rnaup, out_log,
                   args_tar, log):
        for index in range(1, num_up + 1):
            out_tmp_up = open(
                os.path.join(args_tar.out_folder,
                             "".join([self.tmps["rnaup"],
                                      str(index), ".txt"])), "w")
            out_err = open(
                os.path.join(args_tar.out_folder,
                             "".join([self.tmps["log"],
                                      str(index), ".txt"])), "w")
            in_up = open(
                os.path.join(args_tar.out_folder,
                             "".join([self.tmps["tmp"],
                                      str(index), ".fa"])), "r")
            log.write(" ".join([
                args_tar.rnaup_path, "-u",
                str(args_tar.unstr_region_rnaup), "-o", "--interaction_first"
            ]) + "\n")
            p = Popen([
                args_tar.rnaup_path, "-u",
                str(args_tar.unstr_region_rnaup), "-o", "--interaction_first"
            ],
                      stdin=in_up,
                      stdout=out_tmp_up,
                      stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            log.write(
                "The following temporary files for storing results of {0} are "
                "generated:\n".format(prefix))
            for file_ in os.listdir(os.path.join(args_tar.out_folder)):
                log.write("\t" + os.path.join(args_tar.out_folder, file_) +
                          "\n")
            os.system("rm " +
                      os.path.join(args_tar.out_folder, self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            os.system("rm " +
                      os.path.join(args_tar.out_folder, self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder,
                             "".join([self.tmps["rnaup"],
                                      str(index), ".txt"])), out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder,
                             "".join([self.tmps["log"],
                                      str(index), ".txt"])), out_log)

    def _get_continue(self, out_rnaup):
        '''For RNAup, it can continue running RNAup based on previous run'''
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, target_prefixs, args_tar, log):
        log.write("Using RNAup to predict sRNA targets.\n")
        log.write("Please make sure the version of Vienna RNA package is "
                  "at least 2.3.2.\n")
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    log.write("The data from the previous run is found.\n")
                    srnas = self._get_continue(out_rnaup)
                    log.write("The previous data is loaded.\n")
            with open(
                    os.path.join(
                        self.srna_seq_path,
                        "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])),
                    "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(
                            os.path.join(
                                args_tar.out_folder,
                                "".join([self.tmps["tmp"],
                                         str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            for prefix in target_prefixs:
                                self.helper.merge_file(
                                    os.path.join(
                                        self.target_seq_path,
                                        "_".join([prefix, "target.fa"])),
                                    os.path.join(
                                        args_tar.out_folder, "".join([
                                            self.tmps["tmp"],
                                            str(num_up), ".fa"
                                        ])))
                                if num_up == args_tar.core_up:
                                    self._run_rnaup(num_up, processes, prefix,
                                                    out_rnaup, out_log,
                                                    args_tar, log)
                                    processes = []
                                    num_up = 0
                self._run_rnaup(num_up, processes, prefix, out_rnaup, out_log,
                                args_tar, log)
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + out_rnaup +
                      " is complete generated and updated.\n")

    def _intarna(self, prefixs, target_prefixs, args_tar, log):
        log.write("Using IntaRNA to predict sRNA targets.\n")
        log.write(
            "Please make sure the version of IntaRNA is at least 2.0.4.\n")
        all_target = os.path.join(self.target_seq_path, "all_target.fa")
        if os.path.exists(all_target):
            os.remove(all_target)
        for prefix in target_prefixs:
            self.helper.merge_file(
                os.path.join(self.target_seq_path, prefix + "_target.fa"),
                all_target)
        for prefix in prefixs:
            print("Running IntaRNA of {0}".format(prefix))
            intarna_file = os.path.join(self.intarna_path, prefix,
                                        prefix + "_IntaRNA.txt")
            self.helper.check_make_folder(
                os.path.join(self.intarna_path, prefix))
            call([
                args_tar.intarna_path, "-q",
                os.path.join(self.srna_seq_path,
                             "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])),
                "-t", all_target, "--qAccW",
                str(args_tar.slide_win_srna), "--qAccL",
                str(args_tar.max_loop_srna), "--tAccW",
                str(args_tar.slide_win_target), "--tAccL",
                str(args_tar.max_loop_target), "--outMode", "C", "-m",
                args_tar.mode_intarna, "--threads",
                str(args_tar.core_inta), "--out", intarna_file
            ])
            log.write("The prediction for {0} is done.\n".format(prefix))
            log.write("\t" + intarna_file + " is generated.\n")

    def _merge_rnaplex_rnaup(self, prefixs, target_prefixs, args_tar, log):
        '''merge the result of IntaRNA, RNAup and RNAplex'''
        log.write(
            "Running merge_rnaplex_rnaup.py to merge the results from "
            "RNAplex, RNAup, and IntaRNA for generating finanl output.\n")
        log.write("The following files are generated:\n")
        all_gff = os.path.join(self.gff_path, "all.gff")
        if os.path.exists(all_gff):
            os.remove(all_gff)
        for prefix in target_prefixs:
            self.helper.merge_file(
                os.path.join(self.gff_path, prefix + ".gff"), all_gff)
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            intarna_file = None
            out_intarna = None
            self.helper.check_make_folder(os.path.join(self.merge_path,
                                                       prefix))
            print("Ranking {0} now".format(prefix))
            if ("RNAplex" in args_tar.program):
                rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                            "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                    self.rnaplex_path, prefix,
                    "_".join([prefix, "RNAplex_rank.csv"]))
                self._remove_repeat(rnaplex_file, "RNAplex")
            if ("RNAup" in args_tar.program):
                rnaup_file = os.path.join(self.rnaup_path, prefix,
                                          "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(self.rnaup_path, prefix,
                                         "_".join([prefix, "RNAup_rank.csv"]))
                self._remove_repeat(rnaup_file, "RNAup")
            if ("IntaRNA" in args_tar.program):
                intarna_file = os.path.join(self.intarna_path, prefix,
                                            "_".join([prefix, "IntaRNA.txt"]))
                out_intarna = os.path.join(
                    self.intarna_path, prefix,
                    "_".join([prefix, "IntaRNA_rank.csv"]))
                self._remove_repeat(intarna_file, "IntaRNA")
            overlap_file = os.path.join(self.merge_path, prefix,
                                        "_".join([prefix, "overlap.csv"]))
            merge_file = os.path.join(self.merge_path, prefix,
                                      "_".join([prefix, "merge.csv"]))
            merge_srna_target(
                rnaplex_file, rnaup_file, intarna_file, args_tar, out_rnaplex,
                out_rnaup, out_intarna,
                os.path.join(self.fasta_path,
                             prefix + ".fa"), merge_file, overlap_file,
                os.path.join(self.srna_path, "_".join([prefix, "sRNA.gff"])),
                all_gff, target_prefixs)
            if ("RNAplex" in args_tar.program):
                log.write("\t" + out_rnaplex + "\n")
            if ("RNAup" in args_tar.program):
                log.write("\t" + out_rnaup + "\n")
            if ("IntaRNA" in args_tar.program):
                log.write("\t" + out_intarna + "\n")
            if (os.path.exists(merge_file)):
                log.write("\t" + merge_file + "\n")
            if (os.path.exists(overlap_file)):
                log.write("\t" + overlap_file + "\n")

    def _remove_rnaplex(self, line, num, pre_num, pre, checks, out_tmp,
                        print_):
        if (line.startswith(">")):
            if (num % 2 == 1):
                print_ = False
                pre = line
                if (line not in checks):
                    checks[line] = []
                    print_ = True
            elif (num % 2 == 0) and (line not in checks[pre]):
                checks[pre].append(line)
                print_ = True
            num = num + 1
        else:
            if (print_):
                if (num != pre_num):
                    out_tmp.write(pre + "\n")
                    out_tmp.write(checks[pre][-1] + "\n")
                out_tmp.write(line + "\n")
                pre_num = num
        return num, pre_num, print_, pre,

    def _remove_rnaup(self, line, pre, num, pre_num, srna_info, checks,
                      out_tmp, print_, tar):
        if (line.startswith(">")):
            print_ = False
            tar = False
            if (pre.startswith(">")):
                if (pre not in checks):
                    checks[pre] = [line]
                    srna_info = pre
                    print_ = True
                else:
                    if (line not in checks[pre]):
                        checks[pre].append(line)
                        print_ = True
            else:
                if (num != 1):
                    if (line not in checks[srna_info]):
                        checks[srna_info].append(line)
                        print_ = True
        else:
            if (print_):
                if (pre_num != len(checks)):
                    out_tmp.write(srna_info + "\n")
                    out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                else:
                    if (not tar):
                        out_tmp.write(checks[srna_info][-1] + "\n")
                    out_tmp.write(line + "\n")
                pre_num = len(checks)
                tar = True
        pre = line
        num = num + 1
        return num, pre_num, print_, pre, tar, srna_info

    def _remove_intarna(self, line, checks, tar, srna_info, seq, out_tmp):
        if (line.startswith(".")) or (line.startswith("(")) or (
                line.startswith(")")):
            seq = line.split(";")[0]
            if (seq not in checks[tar][srna_info]):
                checks[tar][srna_info].append(seq)
                out_tmp.write(line + "\n")
        else:
            if (len(line.split(";")) >= 8):
                tar = line.split(";")[0]
                srna_info = line.split(";")[3]
                seq = line.split(";")[7]
                if (tar not in checks):
                    checks[tar] = {}
                    checks[tar][srna_info] = [seq]
                    out_tmp.write(line + "\n")
                else:
                    if (srna_info not in checks[tar]):
                        checks[tar][srna_info] = [seq]
                        out_tmp.write(line + "\n")
        return tar, srna_info, seq

    def _remove_repeat(self, interact_file, type_):
        checks = {}
        seq = ""
        pre = ""
        srna_info = ""
        num = 1
        tar = False
        pre_num = 0
        print_ = False
        out_tmp = open(interact_file + "tmp", "w")
        with open(interact_file) as fh:
            for line in fh:
                line = line.strip()
                if (type_ == "RNAplex"):
                    num, pre_num, print_, pre = self._remove_rnaplex(
                        line, num, pre_num, pre, checks, out_tmp, print_)
                elif (type_ == "RNAup"):
                    num, pre_num, print_, pre, tar, srna_info = (
                        self._remove_rnaup(line, pre, num, pre_num, srna_info,
                                           checks, out_tmp, print_, tar))
                elif (type_ == "IntaRNA"):
                    tar, srna_info, seq = self._remove_intarna(
                        line, checks, tar, srna_info, seq, out_tmp)
        out_tmp.close()
        shutil.move(interact_file + "tmp", interact_file)

    def run_srna_target_prediction(self, args_tar, log):
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        target_prefixs = []
        self._gen_seq(prefixs, target_prefixs, args_tar)
        if ("RNAplex" in args_tar.program):
            self._rna_plex(prefixs, target_prefixs, args_tar, log)
        self.helper.remove_all_content(self.target_seq_path, "_target_",
                                       "file")
        shutil.rmtree(os.path.join(self.rnaplex_path, "tmp_RNAplfold"))
        log.write("The temporary files for running RNAplex are deleted.\n")
        if ("RNAup" in args_tar.program):
            self._rnaup(prefixs, target_prefixs, args_tar, log)
        if ("IntaRNA" in args_tar.program):
            self._intarna(prefixs, target_prefixs, args_tar, log)
        self._merge_rnaplex_rnaup(prefixs, target_prefixs, args_tar, log)
        self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"],
                                       "dir")
        self.helper.remove_all_content(args_tar.out_folder, self.tmps["tmp"],
                                       "file")
        self.helper.remove_tmp_dir(args_tar.gffs)
        self.helper.remove_tmp_dir(args_tar.srnas)
        self.helper.remove_tmp_dir(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")
        os.remove(os.path.join(self.target_seq_path, "all_target.fa"))
示例#15
0
class SubLocal(object):

    def __init__(self, args_sub):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express, "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express, "psortb_results")
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file):
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (
                        cds.seq_id == ta.seq_id):
                    if ((cds.end < ta.end) and (
                             cds.end > ta.start) and (
                             cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (
                             cds.start < ta.end) and (
                             cds.end >= ta.end)) or (
                            (cds.end >= ta.end) and (
                             cds.start <= ta.start)) or (
                            (cds.end <= ta.end) and (
                             cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break
        fh.close()
        th.close()
        out.close()

    def _get_protein_seq(self, gff, tmp_path, tran_path):
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generate CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            self._compare_cds_tran(os.path.join(self.gff_path, gff),
                                   os.path.join(tran_path, "_".join([
                                       prefix, "transcript.gff"])))
            self.helper.get_cds_seq(os.path.join(self.out_all, "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        print("transfer DNA seq to protein seq of {0}".format(prefix))
        self.helper.translation(dna_seq_file, "tmp")
        prot_seq_file = os.path.join(
                tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss("tmp", prot_seq_file)
        os.remove("tmp")
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err):
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result):
        print("Running psortb of {0}".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(os.path.join(tmp_result,
                       "_".join([prefix, self.endfix_raw])), "w")
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                         out_raw, out_err)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                         out_raw, out_err)
        else:
            print("Error:It is not a proper bacteria type - {0}!!".format(
                  args_sub.gram))
            sys.exit()
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file):
        if args_sub.merge:
            print("Merge to gff...")
            extract_psortb(os.path.join(
                tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
                os.path.join(tmp_psortb_path, "_".join([
                    prefix, self.endfix_table])),
                gff_file, os.path.join(prefix + ".gff"),
                args_sub.fuzzy)
            shutil.move(prefix + ".gff", gff_file)
        else:
            extract_psortb(os.path.join(
                tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
                os.path.join(tmp_psortb_path, "_".join([
                    prefix, self.endfix_table])),
                None, None, args_sub.fuzzy)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path, psortb_result):
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                     os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                        psortb_result, prefix,
                        "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                            tmp_psortb_path, "_" + self.endfix_raw,
                            gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                            tmp_psortb_path, "_" + self.endfix_table,
                            gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                self.helper.check_make_folder(os.path.join(stat_path, prefix))
                stat_sublocal(merge_table,
                              os.path.join(
                                  stat_path, prefix, prefix),
                              os.path.join(
                                  stat_path, prefix, "_".join([
                                      "stat", prefix, "sublocal.csv"])))

    def _remove_tmps(self, args_sub):
        self.helper.remove_tmp(args_sub.fastas)
        self.helper.remove_tmp(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))

    def run_sub_local(self, args_sub):
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed gene now...")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result)
                self._extract_result(args_sub, self.express_tmp_result, prefix,
                                     os.path.join(self.gff_path, gff))
            print("Running all gene now...")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff))
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path, self.express_result)
        self._remove_tmps(args_sub)
示例#16
0
class SubLocal(object):
    '''detection of subcellular localization'''

    def __init__(self, args_sub):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        self.out_all = os.path.join(args_sub.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDSs")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express, "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express, "psortb_results")
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file, log):
        '''compare CDS and transcript to find the expressed CDS'''
        log.write("Comparing transcripts and CDSs to get expressed CDSs.\n")
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (
                        cds.seq_id == ta.seq_id):
                    if ((cds.end < ta.end) and (
                             cds.end > ta.start) and (
                             cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (
                             cds.start < ta.end) and (
                             cds.end >= ta.end)) or (
                            (cds.end >= ta.end) and (
                             cds.start <= ta.start)) or (
                            (cds.end <= ta.end) and (
                             cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break
        fh.close()
        th.close()
        out.close()
        log.write("\t" + os.path.join(self.out_all, "tmp_cds.gff") + " is "
                  "temporary generated.\n")

    def _get_protein_seq(self, gff, tmp_path, tran_path, args_sub, log):
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            log.write("Predicting subcellular localization for expressed "
                      "CDSs for {0}.\n".format(prefix))
            self._compare_cds_tran(os.path.join(self.gff_path, gff),
                                   os.path.join(tran_path, "_".join([
                                       prefix, "transcript.gff"])), log)
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.out_all, "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            log.write("Predicting subcellular localization for all CDSs for "
                      "{0}.\n".format(prefix))
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        log.write("\t" + dna_seq_file + " is generated.\n")
        print("Transfering DNA sequences to protein sequence of {0}".format(
            prefix))
        log.write("Running helper.py to translate DNA sequences to Protein "
                  "sequences.\n")
        tmp_file = os.path.join(args_sub.out_folder, "tmp")
        self.helper.translation(dna_seq_file, tmp_file)
        prot_seq_file = os.path.join(
                tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss(tmp_file, prot_seq_file)
        log.write(prot_seq_file + " is generated.\n")
        os.remove(tmp_file)
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err, log):
        log.write(" ".join([psortb_path, strain_type, prot_seq_file]) + "\n")
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result, log):
        print("Running psortb of {0}".format(prefix))
        log.write("Running Psortb for predict subcellular localization for "
                  "{0}.\n".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(os.path.join(tmp_result,
                       "_".join([prefix, self.endfix_raw])), "w")
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                         out_raw, out_err, log)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                         out_raw, out_err, log)
        else:
            log.write("Please assign \"positive\" or \"negative\" to "
                      "--bacteria_type.\n")
            print("Error: {0} is not a proper bacteria type! "
                  "Please assign positive or negative.".format(
                  args_sub.gram))
            sys.exit()
        log.write("\t" + os.path.join(tmp_result, "_".join([
            prefix, self.endfix_raw])) + " is temporary generated.\n")
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file, log):
        '''extract the result of psortb'''
        log.write("Running extract_psortb.py to extract the information of "
                  "localization.\n")
        extract_psortb(os.path.join(
            tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
            os.path.join(tmp_psortb_path, "_".join([
                prefix, self.endfix_table])),
            None, None, args_sub.fuzzy)
        log.write("\t" + os.path.join(tmp_psortb_path, "_".join([
            prefix, self.endfix_table])) + " is tempoaray generated.\n")

    def _remove_header(self, out_all):
        out = open(out_all + "_tmp", "w")
        fh = open(out_all, "r")
        out.write("\t".join(["#Genome", "Protein", "Strand", "Start",
                             "End", "Location", "Score"]) + "\n")
        for row in csv.reader(fh, delimiter='\t'):
            if row[0] != "#Genome":
                out.write("\t".join(row) + "\n")
        out.close()
        fh.close()
        shutil.move(out_all + "_tmp", out_all)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path, psortb_result,
                        log):
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                     os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                        psortb_result, prefix,
                        "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                            tmp_psortb_path, "_" + self.endfix_raw,
                            gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                            tmp_psortb_path, "_" + self.endfix_table,
                            gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                log.write("\t" + merge_table + "\n")
                self._remove_header(merge_table)
                self.helper.check_make_folder(os.path.join(stat_path, prefix))
                stat_folder = os.path.join(stat_path, prefix)
                stat_file = os.path.join(stat_folder, "_".join([
                                      "stat", prefix, "sublocal.csv"]))
                stat_sublocal(merge_table,
                              os.path.join(stat_folder, prefix),
                              stat_file)
                for file_ in os.listdir(stat_folder):
                    log.write("\t" + os.path.join(stat_folder, file_) + "\n")

    def _remove_tmps(self, args_sub):
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub, log):
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed genes now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path, args_sub, log)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result, log)
                self._extract_result(args_sub, self.express_tmp_result, prefix,
                                     os.path.join(self.gff_path, gff), log)
            print("Running all genes now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None,
                                           args_sub, log)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result, log)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff), log)
        log.write("Running stat_sublocal.py to do statistics, generate "
                  "merged tables, and plot figures.\n")
        log.write("The following files are generated:\n")
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result, log)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path, self.express_result, log)
        self._remove_tmps(args_sub)
示例#17
0
class RATT(object):
    '''annotation transfer'''

    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {"tar": os.path.join(args_ratt.tar_fastas, "tmp"),
                          "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
                          "out_gff": os.path.join(args_ratt.gff_outfolder,
                                                  "tmp"),
                          "gff": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.gff"),
                          "ptt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.ptt"),
                          "rnt": os.path.join(args_ratt.gff_outfolder,
                                              "tmp.rnt")}

    def _convert_to_pttrnt(self, gffs, files, log):
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)
                    log.write("\t" + ptt + " is generated.\n")
                    log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file")
        log.write("Moving the final output files to {0}.\n".format(args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temperary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
             os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) + 
                  " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data.strip(), "gbk"])
                                break
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (
                    embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            log.write("--related_gbk_files is assigned, but not gbk files are detected.\n"
                      "The gbk file names need to be ended at .gbk, .gb, or .gbff. \n")
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            log.write("Running converter.py to convert gbk file to embl format.\n")
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
            log.write("\t" + self.embl + " is generated and the embl files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        if (not os.path.exists(self.embl)) or (
                not os.path.exists(os.path.join(
                    self.tmp_files["tar"], tar + ".fa"))) or (
                not os.path.exists(os.path.join(
                    self.tmp_files["ref"], ref + ".fa"))):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write("The strain names in --compare_pair should be the same "
                      "as the strain names in fasta, genbank, or embl files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        log.write(" ".join([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")]) + "\n")
        call([args_ratt.ratt_path, self.embl,
              os.path.join(self.tmp_files["tar"], tar + ".fa"),
              args_ratt.element, args_ratt.transfer_type,
              os.path.join(self.tmp_files["ref"], ref + ".fa")],
             stdout=out, stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            self._run_ratt(args_ratt, tar, ref, out, log)
            log.write("The following files are generatd:\n")
            for filename in os.listdir():
                if ("final" in filename):
                    log.write("\t" + filename + "\n")
                    shutil.move(filename, os.path.join(args_ratt.output_path,
                                                       filename))
                elif (args_ratt.element in filename) or (
                      "query" in filename) or (
                      "Reference" in filename) or (
                      "Query" in filename) or (
                      "Sequences" in filename):
                    log.write("\t" + filename + "\n")
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
        out.close()

    def annotation_transfer(self, args_ratt, log):
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            out_gbk = self._convert_embl(args_ratt.ref_gbki, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write("Running converter.py to convert embl "
                          "files in {0} to gff, ptt, and rnt format.\n".format(data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        log.write("Merging the output of {0}.\n".format(data))
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                prefix = ".".join(datas[0].split(".")[:-1])
                for file_ in os.listdir(os.path.join(args_ratt.tar_fastas,
                                                     folder)):
                    files.append(file_[:-3])
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                 args_ratt.gff_outfolder, gff),
                                 self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                 args_ratt.gff_outfolder, gff),
                                 self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(os.path.join(
                                 args_ratt.gff_outfolder, gff),
                                 self.tmp_files["rnt"])
                if os.path.exists(self.tmp_files["gff"]):
                    shutil.move(self.tmp_files["gff"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".gff"))
                    shutil.move(self.tmp_files["ptt"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".ptt"))
                    shutil.move(self.tmp_files["rnt"], os.path.join(
                                self.tmp_files["out_gff"], prefix + ".rnt"))
                else:
                    print("Error: Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).")
                    log.write("Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)
示例#18
0
class SubLocal(object):
    '''detection of subcellular localization'''

    def __init__(self, args_sub):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        self.out_all = os.path.join(args_sub.out_folder, "all_CDS")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDS")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express, "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express, "psortb_results")
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file):
        '''compare CDS and transcript to find the expressed CDS'''
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (
                        cds.seq_id == ta.seq_id):
                    if ((cds.end < ta.end) and (
                             cds.end > ta.start) and (
                             cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (
                             cds.start < ta.end) and (
                             cds.end >= ta.end)) or (
                            (cds.end >= ta.end) and (
                             cds.start <= ta.start)) or (
                            (cds.end <= ta.end) and (
                             cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break
        fh.close()
        th.close()
        out.close()

    def _get_protein_seq(self, gff, tmp_path, tran_path):
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa",
                                             prefix, None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            self._compare_cds_tran(os.path.join(self.gff_path, gff),
                                   os.path.join(tran_path, "_".join([
                                       prefix, "transcript.gff"])))
            self.helper.get_cds_seq(os.path.join(self.out_all, "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff),
                                    fasta, dna_seq_file)
        print("Transfering DNA seq to protein seq of {0}".format(prefix))
        self.helper.translation(dna_seq_file, "tmp")
        prot_seq_file = os.path.join(
                tmp_path, "_".join([prefix, "protein.fa"]))
        self.fixer.fix_emboss("tmp", prot_seq_file)
        os.remove("tmp")
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file,
                out_raw, out_err):
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw, stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result):
        print("Running psortb of {0}".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(os.path.join(tmp_result,
                       "_".join([prefix, self.endfix_raw])), "w")
        prot_seq_file = os.path.join(tmp_path,
                                     "_".join([prefix, "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file,
                         out_raw, out_err)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file,
                         out_raw, out_err)
        else:
            print("Error: It is not a proper bacteria type - {0}!!".format(
                  args_sub.gram))
            sys.exit()
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file):
        '''extract the result of psortb'''
        if args_sub.merge:
            print("Merging gff")
            extract_psortb(os.path.join(
                tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
                os.path.join(tmp_psortb_path, "_".join([
                    prefix, self.endfix_table])),
                gff_file, os.path.join(prefix + ".gff"),
                args_sub.fuzzy)
            shutil.move(prefix + ".gff", gff_file)
        else:
            extract_psortb(os.path.join(
                tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
                os.path.join(tmp_psortb_path, "_".join([
                    prefix, self.endfix_table])),
                None, None, args_sub.fuzzy)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path, psortb_result):
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                     os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                        psortb_result, prefix,
                        "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                            tmp_psortb_path, "_" + self.endfix_raw,
                            gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                            tmp_psortb_path, "_" + self.endfix_table,
                            gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                self.helper.check_make_folder(os.path.join(stat_path, prefix))
                stat_sublocal(merge_table,
                              os.path.join(
                                  stat_path, prefix, prefix),
                              os.path.join(
                                  stat_path, prefix, "_".join([
                                      "stat", prefix, "sublocal.csv"])))

    def _remove_tmps(self, args_sub):
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub):
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed gene now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result)
                self._extract_result(args_sub, self.express_tmp_result, prefix,
                                     os.path.join(self.gff_path, gff))
            print("Running all gene now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None)
            self._run_psortb(args_sub, prefix, self.out_all,
                             self.all_tmp_path, self.all_tmp_result)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff))
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path, self.express_result)
        self._remove_tmps(args_sub)
示例#19
0
class SubLocal(object):
    '''detection of subcellular localization'''
    def __init__(self, args_sub):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_path = os.path.join(args_sub.gffs, "tmp")
        self.fasta_path = os.path.join(args_sub.fastas, "tmp")
        if args_sub.trans is not None:
            self.tran_path = os.path.join(args_sub.trans, "tmp")
        else:
            self.tran_path = None
        self.out_all = os.path.join(args_sub.out_folder, "all_CDSs")
        self.out_express = os.path.join(args_sub.out_folder, "expressed_CDSs")
        self.all_tmp_path = os.path.join(self.out_all, "tmp")
        self.express_tmp_path = os.path.join(self.out_express, "tmp")
        self.all_stat_path = os.path.join(self.out_all, "statistics")
        self.express_stat_path = os.path.join(self.out_express, "statistics")
        self.all_tmp_result = os.path.join(self.out_all, "tmp_results")
        self.express_tmp_result = os.path.join(self.out_express, "tmp_results")
        self.all_result = os.path.join(self.out_all, "psortb_results")
        self.express_result = os.path.join(self.out_express, "psortb_results")
        self.endfix_table = "table.csv"
        self.endfix_raw = "raw.txt"
        self._make_folder()

    def _make_folder(self):
        self.helper.check_make_folder(self.out_all)
        self.helper.check_make_folder(self.out_express)
        self.helper.check_make_folder(self.all_stat_path)
        self.helper.check_make_folder(self.express_stat_path)
        self.helper.check_make_folder(self.all_result)
        self.helper.check_make_folder(self.express_result)

    def _compare_cds_tran(self, gff_file, tran_file, log):
        '''compare CDS and transcript to find the expressed CDS'''
        log.write("Comparing transcripts and CDSs to get expressed CDSs.\n")
        out = open(os.path.join(self.out_all, "tmp_cds.gff"), "w")
        cdss = []
        fh = open(gff_file)
        th = open(tran_file)
        for entry in Gff3Parser().entries(fh):
            if entry.feature == "CDS":
                cdss.append(entry)
        trans = []
        for entry in Gff3Parser().entries(th):
            trans.append(entry)
        for cds in cdss:
            for ta in trans:
                if (cds.strand == ta.strand) and (cds.seq_id == ta.seq_id):
                    if ((cds.end < ta.end) and (cds.end > ta.start) and
                        (cds.start <= ta.start)) or (
                            (cds.start > ta.start) and (cds.start < ta.end) and
                            (cds.end >= ta.end)) or (
                                (cds.end >= ta.end) and
                                (cds.start <= ta.start)) or (
                                    (cds.end <= ta.end) and
                                    (cds.start >= ta.start)):
                        out.write(cds.info + "\n")
                        break
        fh.close()
        th.close()
        out.close()
        log.write("\t" + os.path.join(self.out_all, "tmp_cds.gff") + " is "
                  "temporary generated.\n")

    def _get_protein_seq(self, gff, tmp_path, tran_path, args_sub, log):
        prefix = gff.replace(".gff", "")
        fasta = self.helper.get_correct_file(self.fasta_path, ".fa", prefix,
                                             None, None)
        dna_seq_file = os.path.join(tmp_path, "_".join([prefix, "dna.fa"]))
        print("Generating CDS fasta files of {0}".format(prefix))
        if tran_path is not None:
            log.write("Predicting subcellular localization for expressed "
                      "CDSs for {0}.\n".format(prefix))
            self._compare_cds_tran(
                os.path.join(self.gff_path, gff),
                os.path.join(tran_path, "_".join([prefix, "transcript.gff"])),
                log)
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.out_all, "tmp_cds.gff"),
                                    fasta, dna_seq_file)
            os.remove(os.path.join(self.out_all, "tmp_cds.gff"))
        else:
            log.write("Predicting subcellular localization for all CDSs for "
                      "{0}.\n".format(prefix))
            log.write("Running helper.py to extract sequences for CDSs.\n")
            self.helper.get_cds_seq(os.path.join(self.gff_path, gff), fasta,
                                    dna_seq_file)
        log.write("\t" + dna_seq_file + " is generated.\n")
        print("Transfering DNA sequences to protein sequence of {0}".format(
            prefix))
        log.write("Running helper.py to translate DNA sequences to Protein "
                  "sequences.\n")
        tmp_file = os.path.join(args_sub.out_folder, "tmp")
        self.helper.translation(dna_seq_file, tmp_file)
        prot_seq_file = os.path.join(tmp_path, "_".join([prefix,
                                                         "protein.fa"]))
        self.fixer.fix_emboss(tmp_file, prot_seq_file)
        log.write(prot_seq_file + " is generated.\n")
        os.remove(tmp_file)
        return prefix

    def _psortb(self, psortb_path, strain_type, prot_seq_file, out_raw,
                out_err, log):
        log.write(" ".join([psortb_path, strain_type, prot_seq_file]) + "\n")
        call([psortb_path, strain_type, prot_seq_file],
             stdout=out_raw,
             stderr=out_err)

    def _run_psortb(self, args_sub, prefix, out_folder, tmp_path, tmp_result,
                    log):
        print("Running psortb of {0}".format(prefix))
        log.write("Running Psortb for predict subcellular localization for "
                  "{0}.\n".format(prefix))
        out_err = open(os.path.join(out_folder, "tmp_log"), "w")
        out_raw = open(
            os.path.join(tmp_result, "_".join([prefix, self.endfix_raw])), "w")
        prot_seq_file = os.path.join(tmp_path, "_".join([prefix,
                                                         "protein.fa"]))
        if args_sub.gram == "positive":
            self._psortb(args_sub.psortb_path, "-p", prot_seq_file, out_raw,
                         out_err, log)
        elif args_sub.gram == "negative":
            self._psortb(args_sub.psortb_path, "-n", prot_seq_file, out_raw,
                         out_err, log)
        else:
            log.write("Please assign \"positive\" or \"negative\" to "
                      "--bacteria_type.\n")
            print("Error: {0} is not a proper bacteria type! "
                  "Please assign positive or negative.".format(args_sub.gram))
            sys.exit()
        log.write(
            "\t" +
            os.path.join(tmp_result, "_".join([prefix, self.endfix_raw])) +
            " is temporary generated.\n")
        out_err.close()
        out_raw.close()

    def _extract_result(self, args_sub, tmp_psortb_path, prefix, gff_file,
                        log):
        '''extract the result of psortb'''
        log.write("Running extract_psortb.py to extract the information of "
                  "localization.\n")
        extract_psortb(
            os.path.join(tmp_psortb_path, "_".join([prefix, self.endfix_raw])),
            os.path.join(tmp_psortb_path,
                         "_".join([prefix, self.endfix_table])), None, None,
            args_sub.fuzzy)
        log.write("\t" + os.path.join(tmp_psortb_path, "_".join(
            [prefix, self.endfix_table])) + " is tempoaray generated.\n")

    def _remove_header(self, out_all):
        out = open(out_all + "_tmp", "w")
        fh = open(out_all, "r")
        out.write("\t".join([
            "#Genome", "Protein", "Strand", "Start", "End", "Location", "Score"
        ]) + "\n")
        for row in csv.reader(fh, delimiter='\t'):
            if row[0] != "#Genome":
                out.write("\t".join(row) + "\n")
        out.close()
        fh.close()
        shutil.move(out_all + "_tmp", out_all)

    def _merge_and_stat(self, gffs, tmp_psortb_path, stat_path, psortb_result,
                        log):
        for folder in os.listdir(gffs):
            if folder.endswith(".gff_folder"):
                prefix = folder.replace(".gff_folder", "")
                self.helper.check_make_folder(
                    os.path.join(psortb_result, prefix))
                merge_table = os.path.join(
                    psortb_result, prefix,
                    "_".join([prefix, self.endfix_table]))
                for gff in os.listdir(os.path.join(gffs, folder)):
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_raw,
                        gff.replace(".gff", ""), None, None)
                    shutil.copy(result, os.path.join(psortb_result, prefix))
                    result = self.helper.get_correct_file(
                        tmp_psortb_path, "_" + self.endfix_table,
                        gff.replace(".gff", ""), None, None)
                    self.helper.merge_file(result, merge_table)
                log.write("\t" + merge_table + "\n")
                self._remove_header(merge_table)
                self.helper.check_make_folder(os.path.join(stat_path, prefix))
                stat_folder = os.path.join(stat_path, prefix)
                stat_file = os.path.join(
                    stat_folder, "_".join(["stat", prefix, "sublocal.csv"]))
                stat_sublocal(merge_table, os.path.join(stat_folder, prefix),
                              stat_file)
                for file_ in os.listdir(stat_folder):
                    log.write("\t" + os.path.join(stat_folder, file_) + "\n")

    def _remove_tmps(self, args_sub):
        self.helper.remove_tmp_dir(args_sub.fastas)
        self.helper.remove_tmp_dir(args_sub.gffs)
        self.helper.remove_all_content(args_sub.out_folder, "tmp", "dir")
        self.helper.remove_all_content(self.out_all, "tmp", "dir")
        self.helper.remove_all_content(self.out_express, "tmp", "dir")
        os.remove(os.path.join(self.out_all, "tmp_log"))
        if args_sub.trans is not None:
            os.remove(os.path.join(self.out_express, "tmp_log"))
            self.helper.remove_tmp_dir(args_sub.trans)

    def run_sub_local(self, args_sub, log):
        for gff in os.listdir(args_sub.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(
                    os.path.join(args_sub.gffs, gff))
        self.multiparser.parser_gff(args_sub.gffs, None)
        self.multiparser.parser_fasta(args_sub.fastas)
        if args_sub.trans is not None:
            self.multiparser.parser_gff(args_sub.trans, "transcript")
            self.helper.check_make_folder(self.express_tmp_path)
            self.helper.check_make_folder(self.express_tmp_result)
        self.helper.check_make_folder(self.all_tmp_path)
        self.helper.check_make_folder(self.all_tmp_result)
        for gff in os.listdir(self.gff_path):
            if args_sub.trans is not None:
                print("Running expressed genes now")
                prefix = self._get_protein_seq(gff, self.express_tmp_path,
                                               self.tran_path, args_sub, log)
                self._run_psortb(args_sub, prefix, self.out_express,
                                 self.express_tmp_path,
                                 self.express_tmp_result, log)
                self._extract_result(args_sub, self.express_tmp_result, prefix,
                                     os.path.join(self.gff_path, gff), log)
            print("Running all genes now")
            prefix = self._get_protein_seq(gff, self.all_tmp_path, None,
                                           args_sub, log)
            self._run_psortb(args_sub, prefix, self.out_all, self.all_tmp_path,
                             self.all_tmp_result, log)
            self._extract_result(args_sub, self.all_tmp_result, prefix,
                                 os.path.join(self.gff_path, gff), log)
        log.write("Running stat_sublocal.py to do statistics, generate "
                  "merged tables, and plot figures.\n")
        log.write("The following files are generated:\n")
        self._merge_and_stat(args_sub.gffs, self.all_tmp_result,
                             self.all_stat_path, self.all_result, log)
        if args_sub.trans is not None:
            self._merge_and_stat(args_sub.gffs, self.express_tmp_result,
                                 self.express_stat_path, self.express_result,
                                 log)
        self._remove_tmps(args_sub)
示例#20
0
class RATT(object):
    '''annotation transfer'''
    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")
        }

    def _convert_to_pttrnt(self, gffs, files, log):
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)
                    log.write("\t" + ptt + " is generated.\n")
                    log.write("\t" + rnt + " is generated.\n")

    def _remove_files(self, args_ratt, out_gbk, log):
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file")
        log.write("Moving the final output files to {0}.\n".format(
            args_ratt.gff_outfolder))
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        log.write("Remove the temperary files.\n")
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files, log):
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        log.write("\t" + os.path.join(args_ratt.gff_outfolder, filename) +
                  " is generated.\n")
        files.append(filename)

    def _parser_embl_gbk(self, files):
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data.strip(), "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data.strip(), "gbk"])
                                break
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls, log):
        '''convert gbk to embl'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            log.write(
                "--related_gbk_files is assigned, but not gbk files are detected.\n"
                "The gbk file names need to be ended at .gbk, .gb, or .gbff. \n"
            )
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            log.write(
                "Running converter.py to convert gbk file to embl format.\n")
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
            log.write("\t" + self.embl +
                      " is generated and the embl files are stored in it.\n")
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out, log):
        if (not os.path.exists(self.embl)) or (not os.path.exists(
                os.path.join(self.tmp_files["tar"], tar + ".fa"))) or (
                    not os.path.exists(
                        os.path.join(self.tmp_files["ref"], ref + ".fa"))):
            print("Error: Please check --compare_pair, the strain names "
                  "should be the same as the strain names in fasta, "
                  "genbank or embl files!")
            log.write(
                "The strain names in --compare_pair should be the same "
                "as the strain names in fasta, genbank, or embl files.\n")
            sys.exit()
        log.write("Make sure your RATT version is at least 1.64.\n")
        log.write("If the RATT can not run properly, please check the "
                  "RATT_HOME and PAGIT_HOME is assigned correctly.\n")
        log.write(" ".join([
            args_ratt.ratt_path, self.embl,
            os.path.join(self.tmp_files["tar"], tar +
                         ".fa"), args_ratt.element, args_ratt.transfer_type,
            os.path.join(self.tmp_files["ref"], ref + ".fa")
        ]) + "\n")
        call([
            args_ratt.ratt_path, self.embl,
            os.path.join(self.tmp_files["tar"], tar + ".fa"),
            args_ratt.element, args_ratt.transfer_type,
            os.path.join(self.tmp_files["ref"], ref + ".fa")
        ],
             stdout=out,
             stderr=DEVNULL)
        log.write("Done!\n")

    def _format_and_run(self, args_ratt, log):
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            self._run_ratt(args_ratt, tar, ref, out, log)
            log.write("The following files are generatd:\n")
            for filename in os.listdir():
                if ("final" in filename):
                    log.write("\t" + filename + "\n")
                    shutil.move(filename,
                                os.path.join(args_ratt.output_path, filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or ("Reference" in filename) or (
                            "Query" in filename) or ("Sequences" in filename):
                    log.write("\t" + filename + "\n")
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
        out.close()

    def annotation_transfer(self, args_ratt, log):
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            out_gbk = self._convert_embl(args_ratt.ref_gbki, log)
        self._format_and_run(args_ratt, log)
        files = []
        for data in os.listdir(args_ratt.output_path):
            if "final.embl" in data:
                log.write(
                    "Running converter.py to convert embl "
                    "files in {0} to gff, ptt, and rnt format.\n".format(data))
                self._convert_to_gff(data, args_ratt, files, log)
                self._convert_to_pttrnt(args_ratt.gff_outfolder, files, log)
        self.helper.check_make_folder(self.tmp_files["out_gff"])
        log.write("Merging the output of {0}.\n".format(data))
        for folder in os.listdir(args_ratt.tar_fastas):
            files = []
            if "_folder" in folder:
                datas = folder.split("_folder")
                prefix = ".".join(datas[0].split(".")[:-1])
                for file_ in os.listdir(
                        os.path.join(args_ratt.tar_fastas, folder)):
                    files.append(file_[:-3])
                for gff in os.listdir(args_ratt.gff_outfolder):
                    for file_ in files:
                        if (".gff" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["gff"])
                        if (".ptt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["ptt"])
                        if (".rnt" in gff) and (file_ == gff[:-4]):
                            self.helper.merge_file(
                                os.path.join(args_ratt.gff_outfolder, gff),
                                self.tmp_files["rnt"])
                if os.path.exists(self.tmp_files["gff"]):
                    shutil.move(
                        self.tmp_files["gff"],
                        os.path.join(self.tmp_files["out_gff"],
                                     prefix + ".gff"))
                    shutil.move(
                        self.tmp_files["ptt"],
                        os.path.join(self.tmp_files["out_gff"],
                                     prefix + ".ptt"))
                    shutil.move(
                        self.tmp_files["rnt"],
                        os.path.join(self.tmp_files["out_gff"],
                                     prefix + ".rnt"))
                else:
                    print("Error: Please check your fasta or "
                          "annotation files, they should only contain "
                          "the query genome. And make sure your RATT can "
                          "work properly (check $ANNOgesic/output/"
                          "annotation_transfer/ratt_log.txt).")
                    log.write("Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).\n")
        self._remove_files(args_ratt, out_gbk, log)
示例#21
0
class RATT(object):
    '''annotation transfer'''
    def __init__(self, args_ratt):
        self.multiparser = Multiparser()
        self.converter = Converter()
        self.format_fixer = FormatFixer()
        self.helper = Helper()
        if args_ratt.ref_gbk:
            self.gbk = os.path.join(args_ratt.ref_gbk, "gbk_tmp")
            self.gbk_tmp = os.path.join(self.gbk, "tmp")
            self.embl = os.path.join(args_ratt.ref_gbk, "embls")
        if args_ratt.ref_embls:
            self.embl = args_ratt.ref_embls
        self.ratt_log = os.path.join(args_ratt.output_path, "ratt_log.txt")
        self.tmp_files = {
            "tar": os.path.join(args_ratt.tar_fastas, "tmp"),
            "ref": os.path.join(args_ratt.ref_fastas, "tmp"),
            "out_gff": os.path.join(args_ratt.gff_outfolder, "tmp"),
            "gff": os.path.join(args_ratt.gff_outfolder, "tmp.gff"),
            "ptt": os.path.join(args_ratt.gff_outfolder, "tmp.ptt"),
            "rnt": os.path.join(args_ratt.gff_outfolder, "tmp.rnt")
        }

    def _convert_to_pttrnt(self, gffs, files):
        for gff in files:
            if gff.endswith(".gff"):
                gff = os.path.join(gffs, gff)
                filename = gff.split("/")
                prefix = filename[-1][:-4]
                rnt = gff[:-3] + "rnt"
                ptt = gff[:-3] + "ptt"
                fasta = self.helper.get_correct_file(self.tmp_files["tar"],
                                                     ".fa", prefix, None, None)
                if fasta:
                    self.converter.convert_gff2rntptt(gff, fasta, ptt, rnt,
                                                      None, None)

    def _remove_files(self, args_ratt, out_gbk):
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".gff", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".ptt", "file")
        self.helper.remove_all_content(args_ratt.gff_outfolder, ".rnt", "file")
        self.helper.move_all_content(self.tmp_files["out_gff"],
                                     args_ratt.gff_outfolder, None)
        shutil.rmtree(self.tmp_files["out_gff"])
        shutil.rmtree(self.tmp_files["tar"])
        shutil.rmtree(self.tmp_files["ref"])
        self.helper.remove_tmp_dir(args_ratt.tar_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_fastas)
        self.helper.remove_tmp_dir(args_ratt.ref_embls)
        self.helper.remove_tmp_dir(args_ratt.ref_gbk)

    def _convert_to_gff(self, ratt_result, args_ratt, files):
        name = ratt_result.split(".")
        filename = ".".join(name[1:-2]) + ".gff"
        output_file = os.path.join(args_ratt.output_path, filename)
        self.converter.convert_embl2gff(
            os.path.join(args_ratt.output_path, ratt_result), output_file)
        self.format_fixer.fix_ratt(output_file, ".".join(name[1:-2]),
                                   "tmp_gff")
        shutil.move("tmp_gff", output_file)
        shutil.copy(output_file, os.path.join(args_ratt.gff_outfolder,
                                              filename))
        files.append(filename)

    def _parser_embl_gbk(self, files):
        self.helper.check_make_folder(self.gbk)
        for file_ in files:
            close = False
            with open(file_, "r") as f_h:
                for line in f_h:
                    if (line.startswith("LOCUS")):
                        out = open(self.gbk_tmp, "w")
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "LOCUS"):
                                filename = ".".join([data, "gbk"])
                                break
                    elif (line.startswith("VERSION")):
                        datas = line.split(" ")
                        for data in datas:
                            if (len(data) != 0) and (data != "VERSION"):
                                new_filename = ".".join([data, "gbk"])
                                break
                        if new_filename.find(filename):
                            filename = new_filename
                    if out:
                        out.write(line)
                    if line.startswith("//"):
                        out.close()
                        close = True
                        shutil.move(self.gbk_tmp,
                                    os.path.join(self.gbk, filename))
            if not close:
                out.close()
        return self.gbk

    def _convert_embl(self, ref_embls):
        '''convert gbk to embl'''
        detect_gbk = False
        gbks = []
        out_gbk = None
        for embl in os.listdir(ref_embls):
            if (embl.endswith(".gbk")) or (embl.endswith(".gbff")) or (
                    embl.endswith(".gb")):
                detect_gbk = True
                gbks.append(os.path.join(ref_embls, embl))
        if not detect_gbk:
            print("Error: Please assign proper Genebank files!")
            sys.exit()
        elif detect_gbk:
            out_gbk = self._parser_embl_gbk(gbks)
            self.converter.convert_gbk2embl(out_gbk)
            self.helper.check_make_folder(self.embl)
            self.helper.move_all_content(out_gbk, self.embl, [".embl"])
        return out_gbk

    def _run_ratt(self, args_ratt, tar, ref, out):
        call([
            args_ratt.ratt_path, self.embl,
            os.path.join(self.tmp_files["tar"], tar + ".fa"),
            args_ratt.element, args_ratt.transfer_type,
            os.path.join(self.tmp_files["ref"], ref + ".fa")
        ],
             stdout=out,
             stderr=DEVNULL)

    def _format_and_run(self, args_ratt):
        print("Running RATT")
        for pair in args_ratt.pairs:
            ref = pair.split(":")[0]
            tar = pair.split(":")[1]
            out = open(self.ratt_log, "w+")
            self._run_ratt(args_ratt, tar, ref, out)
            for filename in os.listdir():
                if ("final" in filename):
                    shutil.move(filename,
                                os.path.join(args_ratt.output_path, filename))
                elif (args_ratt.element in filename) or (
                        "query" in filename) or ("Reference" in filename) or (
                            "Query" in filename) or ("Sequences" in filename):
                    if os.path.isfile(filename):
                        os.remove(filename)
                    if os.path.isdir(filename):
                        shutil.rmtree(filename)
        out.close()

    def annotation_transfer(self, args_ratt):
        self.multiparser.parser_fasta(args_ratt.tar_fastas)
        self.multiparser.parser_fasta(args_ratt.ref_fastas)
        out_gbk = None
        if args_ratt.ref_embls is None:
            out_gbk = self._convert_embl(args_ratt.ref_gbk)
        self._format_and_run(args_ratt)
        if args_ratt.convert:
            files = []
            for data in os.listdir(args_ratt.output_path):
                if "final.embl" in data:
                    self._convert_to_gff(data, args_ratt, files)
                    self._convert_to_pttrnt(args_ratt.gff_outfolder, files)
            self.helper.check_make_folder(self.tmp_files["out_gff"])
            for folder in os.listdir(args_ratt.tar_fastas):
                files = []
                if "_folder" in folder:
                    datas = folder.split("_folder")
                    prefix = ".".join(datas[0].split(".")[:-1])
                    for file_ in os.listdir(
                            os.path.join(args_ratt.tar_fastas, folder)):
                        files.append(file_[:-3])
                    for gff in os.listdir(args_ratt.gff_outfolder):
                        for file_ in files:
                            if (".gff" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder, gff),
                                    self.tmp_files["gff"])
                            if (".ptt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder, gff),
                                    self.tmp_files["ptt"])
                            if (".rnt" in gff) and (file_ == gff[:-4]):
                                self.helper.merge_file(
                                    os.path.join(args_ratt.gff_outfolder, gff),
                                    self.tmp_files["rnt"])
                    if os.path.exists(self.tmp_files["gff"]):
                        shutil.move(
                            self.tmp_files["gff"],
                            os.path.join(self.tmp_files["out_gff"],
                                         prefix + ".gff"))
                        shutil.move(
                            self.tmp_files["ptt"],
                            os.path.join(self.tmp_files["out_gff"],
                                         prefix + ".ptt"))
                        shutil.move(
                            self.tmp_files["rnt"],
                            os.path.join(self.tmp_files["out_gff"],
                                         prefix + ".rnt"))
                    else:
                        print("Error: Please check your fasta or "
                              "annotation files, they should only contain "
                              "the query genome. And make sure your RATT can "
                              "work properly (check $ANNOgesic/output/"
                              "annotation_transfer/ratt_log.txt).")
        self._remove_files(args_ratt, out_gbk)
示例#22
0
class sRNATargetPrediction(object):

    def __init__(self, args_tar):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.fixer = FormatFixer()
        self.gff_parser = Gff3Parser()
        self.target_seq_path = os.path.join(args_tar.out_folder, "target_seqs")
        self.srna_seq_path = os.path.join(args_tar.out_folder, "sRNA_seqs")
        self.rnaplex_path = os.path.join(args_tar.out_folder, "RNAplex")
        self.rnaup_path = os.path.join(args_tar.out_folder, "RNAup")
        self.merge_path = os.path.join(args_tar.out_folder, "merge")
        self.srna_path = os.path.join(args_tar.srnas, "tmp")
        self.fasta_path = os.path.join(args_tar.fastas, "tmp")
        self.gff_path = os.path.join(args_tar.gffs, "tmp")
        self.tmps = {"tmp": "tmp", "rnaup": "tmp_rnaup", "log": "tmp_log",
                     "all_fa": "tmp*.fa", "all_txt": "tmp*.txt"}

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_rnaplfold(self, vienna_path, file_type, win_size, span,
                       unstr_region, seq_path, prefix, out_path):
        current = os.getcwd()
        os.chdir(out_path)
        command = " ".join([os.path.join(vienna_path, "RNAplfold"),
                            "-W", str(win_size),
                            "-L", str(span),
                            "-u", str(unstr_region),
                            "-O"])
        if file_type == "sRNA":
            os.system("<".join([command, os.path.join(current, seq_path,
                                "_".join([self.tmps["tmp"], prefix,
                                          file_type + ".fa"]))]))
        else:
            os.system("<".join([command, os.path.join(current, seq_path,
                                "_".join([prefix, file_type + ".fa"]))]))
        os.chdir(current)

    def _wait_process(self, processes):
        for p in processes:
            p.wait()
            if p.stdout:
                p.stdout.close()
            if p.stdin:
                p.stdin.close()
            if p.stderr:
                p.stderr.close()
            try:
                p.kill()
            except OSError:
                pass
            time.sleep(5)

    def _sort_srna_fasta(self, fasta, prefix, path):
        out = open(os.path.join(path,
                   "_".join([self.tmps["tmp"], prefix, "sRNA.fa"])), "w")
        srnas = []
        with open(fasta) as f_h:
            for line in f_h:
                line = line.strip()
                if line.startswith(">"):
                    name = line[1:]
                else:
                    srnas.append({"name": name, "seq": line, "len": len(line)})
        srnas = sorted(srnas, key=lambda x: (x["len"]))
        for srna in srnas:
            out.write(">" + srna["name"].split("|")[0] + "\n")
            out.write(srna["seq"] + "\n")
        out.close()

    def _read_fasta(self, fasta_file):
        seq = ""
        with open(fasta_file, "r") as seq_f:
            for line in seq_f:
                line = line.strip()
                if line.startswith(">"):
                    continue
                else:
                    seq = seq + line
        return seq

    def _get_specific_seq(self, srna_file, seq_file, srna_out, querys):
        for query in querys:
            srna_datas = query.split(":")
            srna = {"seq_id": srna_datas[0], "strand": srna_datas[1],
                    "start": int(srna_datas[2]), "end": int(srna_datas[3])}
            gff_f = open(srna_file, "r")
            out = open(srna_out, "a")
            seq = self._read_fasta(seq_file)
            num = 0
            for entry in self.gff_parser.entries(gff_f):
                if (entry.seq_id == srna["seq_id"]) and (
                        entry.strand == srna["strand"]) and (
                        entry.start == srna["start"]) and (
                        entry.end == srna["end"]):
                    if "ID" in entry.attributes.keys():
                        id_ = entry.attributes["ID"]
                    else:
                        id_ = entry.feature + str(num)
                    gene = self.helper.extract_gene(seq, entry.start,
                                                    entry.end, entry.strand)
                    out.write(">{0}|{1}|{2}|{3}|{4}\n{5}\n".format(
                              id_, entry.seq_id, entry.start,
                              entry.end, entry.strand, gene))
                    num += 1
            gff_f.close()
            out.close()

    def _gen_seq(self, prefixs, args_tar):
        print("Generating sRNA fasta files...")
        for srna in os.listdir(self.srna_path):
            if srna.endswith("_sRNA.gff"):
                prefix = srna.replace("_sRNA.gff", "")
                prefixs.append(prefix)
                srna_out = os.path.join(self.srna_seq_path,
                                        "_".join([prefix, "sRNA.fa"]))
                if "all" in args_tar.query:
                    self.helper.get_seq(
                            os.path.join(self.srna_path, srna),
                            os.path.join(self.fasta_path, prefix + ".fa"),
                            srna_out)
                else:
                    if "_".join([prefix, "sRNA.fa"]) in os.listdir(
                       self.srna_seq_path):
                        os.remove(srna_out)
                    self._get_specific_seq(
                            os.path.join(self.srna_path, srna),
                            os.path.join(self.fasta_path, prefix + ".fa"),
                            srna_out, args_tar.query)
                self._sort_srna_fasta(srna_out, prefix, self.srna_seq_path)
        print("Generating target fasta files...")
        for gff in os.listdir(self.gff_path):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                potential_target(os.path.join(self.gff_path, gff),
                                 os.path.join(self.fasta_path, prefix + ".fa"),
                                 os.path.join(self.target_seq_path), args_tar)
                file_num = 1
                num = 0
                sub_prefix = os.path.join(self.target_seq_path,
                                          "_".join([prefix, "target"]))
                sub_out = open("_".join([sub_prefix, str(file_num) + ".fa"]),
                               "w")
                with open((sub_prefix + ".fa"), "r") as t_f:
                    for line in t_f:
                        line = line.strip()
                        if line.startswith(">"):
                            num += 1
                        if (num == 100):
                            num = 0
                            file_num += 1
                            sub_out.close()
                            sub_out = open("_".join([sub_prefix,
                                           str(file_num) + ".fa"]), "w")
                        sub_out.write(line + "\n")
                sub_out.close()

    def _run_rnaplex(self, prefix, rnaplfold_path, args_tar):
        print("Running RNAplex of {0}".format(prefix))
        num_process = 0
        processes = []
        for seq in os.listdir(self.target_seq_path):
            if (prefix in seq) and ("_target_" in seq):
                print("Running RNAplex with {0}".format(seq))
                out_rnaplex = open(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(num_process) + ".txt"])), "w")
                num_process += 1
                p = Popen([os.path.join(args_tar.vienna_path, "RNAplex"),
                           "-q", os.path.join(
                               self.srna_seq_path, "_".join([
                                   self.tmps["tmp"], prefix, "sRNA.fa"])),
                           "-t", os.path.join(self.target_seq_path, seq),
                           "-l", str(args_tar.inter_length),
                           "-e", str(args_tar.energy),
                           "-z", str(args_tar.duplex_dist),
                           "-a", rnaplfold_path], stdout=out_rnaplex)
                processes.append(p)
                if num_process % args_tar.core_plex == 0:
                    self._wait_process(processes)
        self._wait_process(processes)
        return num_process

    def _rna_plex(self, prefixs, args_tar):
        for prefix in prefixs:
            print("Running RNAplfold of {0}".format(prefix))
            self.helper.check_make_folder(
                        os.path.join(self.rnaplex_path, prefix))
            rnaplfold_path = os.path.join(self.rnaplex_path, prefix,
                                          "RNAplfold")
            os.mkdir(rnaplfold_path)
            self._run_rnaplfold(
                args_tar.vienna_path, "sRNA", args_tar.win_size_s,
                args_tar.span_s, args_tar.unstr_region_rnaplex_s,
                self.srna_seq_path, prefix, rnaplfold_path)
            self._run_rnaplfold(
                args_tar.vienna_path, "target", args_tar.win_size_t,
                args_tar.span_t, args_tar.unstr_region_rnaplex_t,
                self.target_seq_path, prefix, rnaplfold_path)
            num_process = self._run_rnaplex(prefix, rnaplfold_path, args_tar)
            rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                        "_".join([prefix, "RNAplex.txt"]))
            if ("_".join([prefix, "RNAplex.txt"]) in
                    os.listdir(os.path.join(self.rnaplex_path, prefix))):
                os.remove(rnaplex_file)
            for index in range(0, num_process):
                self.helper.merge_file(os.path.join(
                    self.rnaplex_path, prefix, "_".join([
                        prefix, "RNAplex", str(index) + ".txt"])),
                    rnaplex_file)
            self.helper.remove_all_content(os.path.join(
                 self.rnaplex_path, prefix), "_RNAplex_", "file")
            self.fixer.fix_rnaplex(rnaplex_file, self.tmps["tmp"])
            shutil.move(self.tmps["tmp"], rnaplex_file)

    def _run_rnaup(self, num_up, processes, out_rnaup, out_log, args_tar):
        for index in range(1, num_up + 1):
            out_tmp_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["rnaup"],
                                              str(index), ".txt"])), "w")
            out_err = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["log"],
                                              str(index), ".txt"])), "w")
            in_up = open(os.path.join(
                args_tar.out_folder, "".join([self.tmps["tmp"],
                                              str(index), ".fa"])), "r")
            p = Popen([os.path.join(args_tar.vienna_path, "RNAup"),
                       "-u", str(args_tar.unstr_region_rnaup),
                       "-o", "--interaction_first"],
                      stdin=in_up, stdout=out_tmp_up, stderr=out_err)
            processes.append(p)
        if len(processes) != 0:
            time.sleep(5)
            self._wait_process(processes)
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_fa"]))
            self._merge_txt(num_up, out_rnaup, out_log, args_tar.out_folder)
            os.system("rm " + os.path.join(args_tar.out_folder,
                                           self.tmps["all_txt"]))

    def _merge_txt(self, num_up, out_rnaup, out_log, out_folder):
        for index in range(1, num_up + 1):
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["rnaup"],
                                                  str(index), ".txt"])),
                out_rnaup)
            self.helper.merge_file(
                os.path.join(out_folder, "".join([self.tmps["log"],
                                                  str(index), ".txt"])),
                out_log)

    def _get_continue(self, out_rnaup):
        srnas = []
        matchs = {}
        out = open("tmp.txt", "w")
        with open(out_rnaup) as f_h:
            for line in f_h:
                line = line.strip()
                if ">srna" in line:
                    srna = line[1:]
                    srnas.append(srna)
                    matchs[srna] = []
                else:
                    matchs[srna].append(line)
        srnas = srnas[:-1]
        for srna in srnas:
            out.write(">" + srna + "\n")
            for target in matchs[srna]:
                out.write(target + "\n")
        out.close()
        os.remove(out_rnaup)
        shutil.move("tmp.txt", out_rnaup)
        return srnas

    def _rnaup(self, prefixs, args_tar):
        for prefix in prefixs:
            srnas = []
            print("Running RNAup of {0}".format(prefix))
            if not os.path.exists(os.path.join(self.rnaup_path, prefix)):
                os.mkdir(os.path.join(self.rnaup_path, prefix))
            num_up = 0
            processes = []
            out_rnaup = os.path.join(self.rnaup_path, prefix,
                                     "_".join([prefix + "_RNAup.txt"]))
            out_log = os.path.join(self.rnaup_path, prefix,
                                   "_".join([prefix + "_RNAup.log"]))
            if "_".join([prefix, "RNAup.txt"]) in \
                    os.listdir(os.path.join(self.rnaup_path, prefix)):
                if not args_tar.continue_rnaup:
                    os.remove(out_rnaup)
                    os.remove(out_log)
                else:
                    srnas = self._get_continue(out_rnaup)
            with open(os.path.join(self.srna_seq_path, "_".join([
                    self.tmps["tmp"], prefix, "sRNA.fa"])), "r") as s_f:
                for line in s_f:
                    line = line.strip()
                    if line.startswith(">"):
                        if line[1:] in srnas:
                            start = False
                            continue
                        start = True
                        print("Running RNAup with {0}".format(line[1:]))
                        num_up += 1
                        out_up = open(os.path.join(args_tar.out_folder,
                                      "".join([self.tmps["tmp"],
                                               str(num_up), ".fa"])), "w")
                        out_up.write(line + "\n")
                    else:
                        if start:
                            out_up.write(line + "\n")
                            out_up.close()
                            self.helper.merge_file(os.path.join(
                                self.target_seq_path,
                                "_".join([prefix, "target.fa"])),
                                os.path.join(args_tar.out_folder,
                                             "".join([self.tmps["tmp"],
                                                      str(num_up), ".fa"])))
                            if num_up == args_tar.core_up:
                                self._run_rnaup(num_up, processes,
                                                out_rnaup, out_log, args_tar)
                                processes = []
                                num_up = 0
            self._run_rnaup(num_up, processes, out_rnaup, out_log, args_tar)

    def _merge_rnaplex_rnaup(self, prefixs, args_tar):
        for prefix in prefixs:
            rnaplex_file = None
            rnaup_file = None
            out_rnaplex = None
            out_rnaup = None
            self.helper.check_make_folder(os.path.join(
                                          self.merge_path, prefix))
            print("Ranking {0} now...".format(prefix))
            if (args_tar.program == "both") or (args_tar.program == "RNAplex"):
                rnaplex_file = os.path.join(self.rnaplex_path, prefix,
                                            "_".join([prefix, "RNAplex.txt"]))
                out_rnaplex = os.path.join(
                        self.rnaplex_path, prefix,
                        "_".join([prefix, "RNAplex_rank.csv"]))
            if (args_tar.program == "both") or (args_tar.program == "RNAup"):
                rnaup_file = os.path.join(self.rnaup_path, prefix,
                                          "_".join([prefix, "RNAup.txt"]))
                out_rnaup = os.path.join(self.rnaup_path, prefix,
                                         "_".join([prefix, "RNAup_rank.csv"]))
            merge_srna_target(rnaplex_file, rnaup_file, args_tar,
                              out_rnaplex, out_rnaup,
                              os.path.join(self.merge_path, prefix,
                                           "_".join([prefix, "merge.csv"])),
                              os.path.join(self.merge_path, prefix,
                                           "_".join([prefix, "overlap.csv"])),
                              os.path.join(self.srna_path,
                                           "_".join([prefix, "sRNA.gff"])),
                              os.path.join(self.gff_path, prefix + ".gff"))

    def run_srna_target_prediction(self, args_tar):
        self._check_gff(args_tar.gffs)
        self._check_gff(args_tar.srnas)
        self.multiparser.parser_gff(args_tar.gffs, None)
        self.multiparser.parser_fasta(args_tar.fastas)
        self.multiparser.parser_gff(args_tar.srnas, "sRNA")
        prefixs = []
        self._gen_seq(prefixs, args_tar)
        if (args_tar.program == "both") or (
                args_tar.program == "RNAplex"):
            self._rna_plex(prefixs, args_tar)
        self.helper.remove_all_content(self.target_seq_path,
                                       "_target_", "file")
        if (args_tar.program == "both") or (
                args_tar.program == "RNAup"):
            self._rnaup(prefixs, args_tar)
        self._merge_rnaplex_rnaup(prefixs, args_tar)
        if (args_tar.program == "RNAplex") or (
                args_tar.program == "both"):
            for strain in os.listdir(os.path.join(
                          args_tar.out_folder, "RNAplex")):
                shutil.rmtree(os.path.join(args_tar.out_folder, "RNAplex",
                                           strain, "RNAplfold"))
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "dir")
        self.helper.remove_all_content(args_tar.out_folder,
                                       self.tmps["tmp"], "file")
        self.helper.remove_tmp(args_tar.gffs)
        self.helper.remove_tmp(args_tar.srnas)
        self.helper.remove_tmp(args_tar.fastas)
        self.helper.remove_all_content(self.srna_seq_path, "tmp_", "file")