示例#1
0
class sRNADetection(object):

    def __init__(self, args_srna):
        self.args_container = ArgsContainer()
        self.helper = Helper()
        self.multiparser = Multiparser()
        self.gff_output = os.path.join(args_srna.out_folder, "gffs")
        self.table_output = os.path.join(args_srna.out_folder, "tables")
        self.stat_path = os.path.join(args_srna.out_folder, "statistics")
        self.tss_path = self._check_folder_exist(args_srna.tss_folder)
        self.pro_path = self._check_folder_exist(args_srna.pro_folder)
        self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
        self.fasta_path = os.path.join(args_srna.fastas, "tmp")
        self.tran_path = os.path.join(args_srna.trans, "tmp")
        self.term_path = self._check_folder_exist(args_srna.terms)
        self.merge_wigs = os.path.join(args_srna.out_folder, "merge_wigs")
        self.prefixs = {"merge": os.path.join(
                            args_srna.out_folder, "tmp_merge"),
                        "utr": os.path.join(
                            args_srna.out_folder, "tmp_utrsrna"),
                        "normal": os.path.join(
                            args_srna.out_folder, "tmp_normal"),
                        "in_cds": os.path.join(
                            args_srna.out_folder, "tmp_incds"),
                        "merge_table": os.path.join(
                            args_srna.out_folder, "tmp_merge_table"),
                        "utr_table": os.path.join(
                            args_srna.out_folder, "tmp_utrsrna_table"),
                        "normal_table": os.path.join(
                            args_srna.out_folder, "tmp_normal_table"),
                        "in_cds_table": os.path.join(
                            args_srna.out_folder, "tmp_incds_table"),
                        "basic": os.path.join(
                            args_srna.out_folder, "tmp_basic"),
                        "energy": os.path.join(
                            args_srna.out_folder, "tmp_energy")}
        self.tmps = {"nr": os.path.join(args_srna.out_folder, "tmp_nr"),
                     "srna": os.path.join(args_srna.out_folder, "tmp_sRNA")}
        self.best_table = os.path.join(self.table_output, "best")
        self.table_output = os.path.join(args_srna.out_folder, "tables")
        self.stat_path = os.path.join(args_srna.out_folder, "statistics")
        self.all_best = {"all_gff": os.path.join(
                             self.gff_output, "all_candidates"),
                         "best_gff": os.path.join(self.gff_output, "best"),
                         "all_table": os.path.join(
                             self.table_output, "all_candidates"),
                         "best_table": os.path.join(self.table_output, "best")}

    def _check_folder_exist(self, folder):
        if folder is not None:
            path = os.path.join(folder, "tmp")
        else:
            path = None
        return path

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_format(self, blast_path, database, type_, db_file, err):
        call([os.path.join(blast_path, "makeblastdb"), "-in", database,
              "-dbtype", type_, "-out", db_file], stderr=err)

    def _formatdb(self, database, type_, out_folder,
                  blast_path, database_type):
        err = open(os.path.join(out_folder, "log.txt"), "w")
        if (database.endswith(".fa")) or (
                database.endswith(".fna")) or (
                database.endswith(".fasta")):
            pass
        else:
            folders = database.split("/")
            filename = folders[-1]
            folder = "/".join(folders[:-1])
            for fasta in os.listdir(folder):
                if (fasta.endswith(".fa")) or (
                        fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    if ".".join(fasta.split(".")[:-1]) == filename:
                        database = os.path.join(folder, fasta)
        if database_type == "sRNA":
            change_format(database, "tmp_srna_database")
            os.remove(database)
            shutil.move("tmp_srna_database", database)
        db_file = ".".join(database.split(".")[:-1])
        self._run_format(blast_path, database, type_, db_file, err)
        err.close()

    def _merge_frag_tex_file(self, files, args_srna):
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            self.helper.merge_file(files["frag_gff"], files["tex_gff"])
            self.helper.merge_file(files["frag_csv"], files["tex_csv"])
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])
            os.remove(files["frag_csv"])
            os.remove(files["frag_gff"])
            os.remove(files["tex_gff"])
        elif (args_srna.frag_wigs is not None):
            shutil.move(files["frag_csv"], files["merge_csv"])
            self.helper.sort_gff(files["frag_gff"], files["merge_gff"])
            os.remove(files["frag_gff"])
        elif (args_srna.tex_wigs is not None):
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])

    def _run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna):
        if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter"))
        files = {"frag_gff": None, "frag_csv": None,
                 "tex_gff": None, "tex_csv": None,
                 "merge_gff": None, "merge_csv": None}
        if ("tss" in args_srna.import_info):
            tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff",
                                               prefix, None, None)
        else:
            tss = None
        if self.pro_path is not None:
            pro = self.helper.get_correct_file(
                    self.pro_path, "_processing.gff", prefix, None, None)
        else:
            pro = None
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                    args_srna.out_folder, "_".join(["tmp_frag", prefix]))
            files["frag_csv"] = os.path.join(
                    args_srna.out_folder, "_".join(["tmp_frag_table", prefix]))

            args_srna = self.args_container.container_intersrna(
                             "frag", files, args_srna, prefix,
                             os.path.join(args_srna.gffs, gff), tran, tss,
                             pro, fuzzy_tss)
            intergenic_srna(args_srna)
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(
                    args_srna.out_folder, "_".join(["tmp_tex", prefix]))
            files["tex_csv"] = os.path.join(
                    args_srna.out_folder, "_".join(["tmp_tex_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                           "tex", files, args_srna, prefix,
                           os.path.join(args_srna.gffs, gff), tran, tss,
                           pro, fuzzy_tss)
            intergenic_srna(args_srna)
        files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["normal"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        if "TSS_class" in os.listdir(args_srna.out_folder):
            tss = os.path.join(args_srna.out_folder,
                               "TSS_class", prefix + "_TSS.gff")
        return tss

    def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna):
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        files = {"frag_gff": None, "frag_csv": None,
                 "tex_gff": None, "tex_csv": None,
                 "merge_gff": None, "merge_csv": None}
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(
                    args_srna.out_folder, "_".join(["tmp_utr_tex", prefix]))
            files["tex_csv"] = os.path.join(
                    args_srna.out_folder,
                    "_".join(["tmp_utr_tex_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                    os.path.join(args_srna.gffs, gff), tran, tss, files,
                    pro, os.path.join(self.fasta_path, prefix + ".fa"),
                    "tex", prefix, args_srna)
            utr_derived_srna(args_srna)
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                    os.path.join(args_srna.gffs, gff), tran, tss, files,
                    pro, os.path.join(self.fasta_path, prefix + ".fa"),
                    "frag", prefix, args_srna)
            utr_derived_srna(args_srna)
        files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["utr"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        filter_utr(files["merge_gff"], files["merge_csv"], args_srna.min_utr)

    def _check_necessary_file(self, args_srna):
        if (args_srna.gffs is None) or (args_srna.trans is None) or (
                (args_srna.tex_wigs is None) and (
                args_srna.frag_wigs is None)):
            print("Error: lack required files!!!!")
            sys.exit()
        if args_srna.utr_srna:
            if (args_srna.tss_folder is None):
                print("Error: lack required TSS files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            if (args_srna.pro_folder is None):
                print("Warning: lack Processing site files for UTR "
                      "derived sRNA detection!!!")
                print("it may effect the results!!!!")
        self._check_gff(args_srna.gffs)
        self._check_gff(args_srna.trans)
        if args_srna.tss_folder is not None:
            self._check_gff(args_srna.tss_folder)
            self.multiparser.parser_gff(args_srna.tss_folder, "TSS")
            self.multiparser.combine_gff(args_srna.gffs, self.tss_path,
                                         None, "TSS")
        if args_srna.pro_folder is not None:
            self._check_gff(args_srna.pro_folder)
            self.multiparser.parser_gff(args_srna.pro_folder, "processing")
            self.multiparser.combine_gff(args_srna.gffs, self.pro_path,
                                         None, "processing")
        if args_srna.sorf_file is not None:
            self._check_gff(args_srna.sorf_file)
            self.multiparser.parser_gff(args_srna.sorf_file, "sORF")
            self.multiparser.combine_gff(args_srna.gffs, self.sorf_path,
                                         None, "sORF")
        if args_srna.utr_srna or ("sec_str" in args_srna.import_info) or (
           "blast_nr" in args_srna.import_info) or (
           "blast_srna" in args_srna.import_info):
            if args_srna.fastas is None:
                print("Error: lack required fasta files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            self.multiparser.parser_fasta(args_srna.fastas)
            self.multiparser.combine_fasta(args_srna.gffs,
                                           self.fasta_path, None)
        if args_srna.terms is not None:
            self._check_gff(args_srna.terms)
            self.multiparser.parser_gff(args_srna.terms, "term")
            self.multiparser.combine_gff(args_srna.gffs, self.term_path,
                                         None, "term")
        else:
            self.term_path = None

    def _run_program(self, args_srna):
        prefixs = []
        tss = None
        for gff in os.listdir(args_srna.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Running sRNA detection of {0}....".format(prefix))
                tran = self.helper.get_correct_file(
                        self.tran_path, "_transcript.gff", prefix, None, None)
                gffs = {"merge": "_".join([self.prefixs["merge"], prefix]),
                        "utr": "_".join([self.prefixs["utr"], prefix]),
                        "normal": "_".join([self.prefixs["normal"], prefix])}
                csvs = {"merge": "_".join([
                            self.prefixs["merge_table"], prefix]),
                        "utr": "_".join([self.prefixs["utr_table"], prefix]),
                        "normal": "_".join([
                            self.prefixs["normal_table"], prefix])}
                tss = self._run_normal(
                        prefix, gff, tran, args_srna.fuzzy_tsss["inter"],
                        args_srna)
                if args_srna.utr_srna:
                    print("Running UTR derived sRNA detection of {0}".format(
                          prefix))
                    if tss is None:
                        tss = self.helper.get_correct_file(
                                self.tss_path, "_TSS.gff", prefix, None, None)
                    if self.pro_path is not None:
                        pro = self.helper.get_correct_file(
                                self.pro_path, "_processing.gff",
                                prefix, None, None)
                    else:
                        pro = None
                    if tss is not None:
                        self._run_utrsrna(gff, tran, prefix,
                                          tss, pro, args_srna)
                self._merge_srna(args_srna, gffs, csvs, prefix,
                                 os.path.join(args_srna.gffs, gff), tss)
                filter_frag(csvs["merge"], gffs["merge"])
                self.helper.sort_gff(gffs["merge"],
                                     "_".join([self.prefixs["basic"], prefix]))
        return prefixs

    def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss):
        print("merging data of intergenic and UTR_derived sRNA...")
        merge_srna_gff(gffs, args_srna.in_cds,
                       args_srna.cutoff_overlap, gff_file)
        merge_srna_table(gffs["merge"], csvs, os.path.join(args_srna.wig_path,
                         "_".join([prefix, "forward.wig"])),
                         os.path.join(args_srna.wig_path,
                         "_".join([prefix, "reverse.wig"])),
                         tss, args_srna)

    def _run_RNAfold(self, seq_file, vienna_path, sec_file):
        os.system(" ".join(["cat", seq_file, "|",
                  os.path.join(vienna_path, "RNAfold"),
                  "-p", ">", sec_file]))

    def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path,
                     dot_path, vienna_path):
        detect = False
        for fasta in os.listdir(fasta_path):
            if fasta.endswith(".fa") and (
               fasta.replace(".fa", "") == prefix):
                detect = True
                break
        if detect:
            detect = False
            seq_file = os.path.join(out_folder, "_".join(["sRNA_seq", prefix]))
            sec_file = os.path.join(out_folder, "_".join(["sRNA_2d", prefix]))
            self.helper.get_seq("_".join([self.prefixs["basic"], prefix]),
                                os.path.join(fasta_path, fasta), seq_file)
        else:
            print("Error:There is not fasta file of {0}".format(prefix))
            print("please check your imported information")
            sys.exit()
        tmp_path = os.path.join(out_folder, "tmp_srna")
        self.helper.check_make_folder(tmp_path)
        main_path = os.getcwd()
        os.chdir(tmp_path)
        sec_file = os.path.join(main_path, sec_file)
        seq_file = os.path.join(main_path, seq_file)
        tmp_sec_path = os.path.join(main_path, sec_path)
        tmp_dot_path = os.path.join(main_path, dot_path)
        self._run_RNAfold(seq_file, vienna_path, sec_file)
        extract_energy(os.path.join(main_path,
                       "_".join([self.prefixs["basic"], prefix])),
                       sec_file, os.path.join(main_path,
                       "_".join([self.prefixs["energy"], prefix])))
        for ps in os.listdir(os.getcwd()):
            new_ps = ps.replace("|", "_")
            shutil.move(ps, new_ps)
        return {"sec": tmp_sec_path, "dot": tmp_dot_path, "main": main_path,
                "tmp": os.path.join(main_path, tmp_path)}

    def _run_replot(self, vienna_util, tmp_paths, file_, dot_file, rel_file):
        os.system(" ".join([os.path.join(vienna_util, "relplot.pl"),
                  os.path.join(tmp_paths["tmp"], file_),
                  os.path.join(tmp_paths["tmp"], dot_file),
                  ">", os.path.join(tmp_paths["tmp"], rel_file)]))

    def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file):
        call([ps2pdf14_path, os.path.join(tmp_paths["tmp"], file_), pdf_file])

    def _replot_sec_to_pdf(self, vienna_util, tmp_paths,
                           ps2pdf14_path, prefix):
        for file_ in os.listdir(os.getcwd()):
            if file_.endswith("ss.ps"):
                dot_file = file_.replace("ss.ps", "dp.ps")
                rel_file = file_.replace("ss.ps", "rss.ps")
                print("replot {0}".format(file_))
                self._run_replot(vienna_util, tmp_paths, file_,
                                 dot_file, rel_file)
        for file_ in os.listdir(tmp_paths["tmp"]):
            if (file_.endswith("rss.ps")) or (file_.endswith("dp.ps")):
                pdf_file = file_.replace(".ps", ".pdf")
                print("convert {0} to pdf".format(file_))
                self._convert_pdf(ps2pdf14_path, tmp_paths,
                                  file_, pdf_file)
        os.mkdir(os.path.join(tmp_paths["sec"], prefix))
        os.mkdir(os.path.join(tmp_paths["dot"], prefix))
        self.helper.move_all_content(
                tmp_paths["tmp"], os.path.join(tmp_paths["sec"], prefix),
                ["rss.pdf"])
        self.helper.move_all_content(
                tmp_paths["tmp"], os.path.join(tmp_paths["dot"], prefix),
                ["dp.pdf"])

    def _run_mountain(self, vienna_util, tmp_paths, dot_file, out):
        call([os.path.join(vienna_util, "mountain.pl"),
              os.path.join(tmp_paths["tmp"], dot_file)], stdout=out)

    def _plot_mountain(self, mountain, moun_path,
                       tmp_paths, prefix, vienna_util):
        if mountain:
            tmp_moun_path = os.path.join(tmp_paths["main"], moun_path)
            os.mkdir(os.path.join(tmp_moun_path, prefix))
            txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt")
            self.helper.check_make_folder(txt_path)
            print("Generating mountain plot of {0}....".format(prefix))
            for dot_file in os.listdir(tmp_paths["tmp"]):
                if dot_file.endswith("dp.ps"):
                    moun_txt = os.path.join(tmp_paths["tmp"], "mountain.txt")
                    out = open(moun_txt, "w")
                    moun_file = dot_file.replace("dp.ps", "mountain.pdf")
                    print("Generating {0}".format(moun_file))
                    self._run_mountain(vienna_util, tmp_paths, dot_file, out)
                    plot_mountain_plot(moun_txt, moun_file)
                    shutil.move(moun_file,
                                os.path.join(tmp_moun_path, prefix, moun_file))
                    out.close()
                    os.remove(moun_txt)

    def _compute_2d_and_energy(self, args_srna, prefixs):
        print("Running energy calculation....")
        moun_path = os.path.join(args_srna.out_folder, "mountain_plot")
        sec_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "sec_plot")
        dot_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "dot_plot")
        self.helper.remove_all_content(sec_path, None, "dir")
        self.helper.remove_all_content(dot_path, None, "dir")
        self.helper.remove_all_content(moun_path, None, "dir")
        for prefix in prefixs:
            tmp_paths = self._get_seq_sec(
                    self.fasta_path, args_srna.out_folder, prefix, sec_path,
                    dot_path, args_srna.vienna_path)
            self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths,
                                    args_srna.ps2pdf14_path, prefix)
            self._plot_mountain(args_srna.mountain, moun_path, tmp_paths,
                                prefix, args_srna.vienna_util)
            self.helper.remove_all_content(os.getcwd(), ".ps", "file")
            os.chdir(tmp_paths["main"])
            shutil.move("_".join([self.prefixs["energy"], prefix]),
                        "_".join([self.prefixs["basic"], prefix]))
            shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna"))

    def _run_blast(self, blast_path, program, database, e, seq_file,
                   blast_file, strand):
        call([os.path.join(blast_path, program), "-db", database,
              "-evalue", str(e), "-strand", strand, "-query", seq_file,
              "-out", blast_file])

    def _get_strand_fasta(self, seq_file, out_folder):
        tmp_plus = os.path.join(out_folder, "tmp_plus.fa")
        tmp_minus = os.path.join(out_folder, "tmp_minus.fa")
        out_p = open(tmp_plus, "w")
        out_m = open(tmp_minus, "w")
        strand = ""
        with open(seq_file) as sh:
            for line in sh:
                line = line.strip()
                if line.startswith(">"):
                    if line[-1] == "+":
                        out_p.write(line + "\n")
                        strand = "plus"
                    elif line[-1] == "-":
                        out_m.write(line + "\n")
                        strand = "minus"
                else:
                    if strand == "plus":
                        out_p.write(line + "\n")
                    elif strand == "minus":
                        out_m.write(line + "\n")
        out_p.close()
        out_m.close()
        return tmp_plus, tmp_minus

    def _blast(self, database, database_format, data_type, args_srna,
               prefixs, program, database_type, e):
        if (database is None):
            print("Error: No database assigned!")
        else:
            if database_format:
                self._formatdb(database, data_type, args_srna.out_folder,
                               args_srna.blast_path, database_type)
            for prefix in prefixs:
                blast_file = os.path.join(
                        args_srna.out_folder, "blast_result_and_misc",
                        "_".join([database_type, "blast", prefix + ".txt"]))
                srna_file = "_".join([self.prefixs["basic"], prefix])
                out_file = os.path.join(
                        args_srna.out_folder,
                        "_".join(["tmp", database_type, prefix]))
                print("Running Blast of {0}".format(prefix))
                seq_file = os.path.join(
                        args_srna.out_folder, "_".join(["sRNA_seq", prefix]))
                if seq_file not in os.listdir(args_srna.out_folder):
                    self.helper.get_seq(
                            srna_file,
                            os.path.join(self.fasta_path, prefix + ".fa"),
                            seq_file)
                if database_type == "nr":
                    tmp_plus, tmp_minus = self._get_strand_fasta(
                            seq_file, args_srna.out_folder)
                    tmp_blast = os.path.join("tmp_blast.txt")
                    self._run_blast(args_srna.blast_path, program, database, e,
                                    tmp_plus, tmp_blast, "plus")
                    self._run_blast(args_srna.blast_path, program, database, e,
                                    tmp_minus, blast_file, "minus")
                    self.helper.merge_file(tmp_blast, blast_file)
                    os.remove(tmp_blast)
                    os.remove(tmp_plus)
                    os.remove(tmp_minus)
                else:
                    self._run_blast(args_srna.blast_path, program, database, e,
                                    seq_file, blast_file, "both")
                extract_blast(blast_file, srna_file, out_file,
                              out_file + ".csv", database_type)
                shutil.move(out_file, srna_file)

    def _class_srna(self, prefixs, args_srna):
        if (len(args_srna.import_info) != 1) or (
                len(args_srna.import_info) != 0):
            for prefix in prefixs:
                print("classifying sRNA of {0}".format(prefix))
                class_gff = os.path.join(self.gff_output, "for_class")
                class_table = os.path.join(self.table_output, "for_class")
                self.helper.check_make_folder(os.path.join(class_table,
                                                           prefix))
                self.helper.check_make_folder(os.path.join(class_gff, prefix))
                class_gff = os.path.join(class_gff, prefix)
                class_table = os.path.join(class_table, prefix)
                self.helper.check_make_folder(class_table)
                self.helper.check_make_folder(class_gff)
                out_stat = os.path.join(
                        self.stat_path, "_".join([
                            "stat_sRNA_class", prefix + ".csv"]))
                classify_srna(os.path.join(self.all_best["all_gff"],
                              "_".join([prefix, "sRNA.gff"])), class_gff,
                              out_stat, args_srna)
                for srna in os.listdir(class_gff):
                    out_table = os.path.join(
                            class_table, srna.replace(".gff", ".csv"))
                    gen_srna_table(
                        os.path.join(class_gff, srna),
                        "_".join([self.prefixs["merge_table"], prefix]),
                        "_".join([self.tmps["nr"], prefix + ".csv"]),
                        "_".join([self.tmps["srna"], prefix + ".csv"]),
                        args_srna, out_table)

    def _get_best_result(self, prefixs, args_srna):
        for prefix in prefixs:
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            gen_best_srna(os.path.join(self.all_best["all_gff"],
                                       "_".join([prefix, "sRNA.gff"])),
                          best_gff, args_srna)
            gen_srna_table(os.path.join(self.all_best["best_gff"],
                           "_".join([prefix, "sRNA.gff"])),
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, best_table)

    def _remove_file(self, args_srna):
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir")
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "file")
        self.helper.remove_tmp(args_srna.fastas)
        self.helper.remove_tmp(args_srna.gffs)
        if args_srna.frag_wigs is not None:
            self.helper.remove_tmp(args_srna.frag_wigs)
        if args_srna.tex_wigs is not None:
            self.helper.remove_tmp(args_srna.tex_wigs)
        if (args_srna.frag_wigs is not None) and (
                args_srna.tex_wigs is not None):
            shutil.rmtree(args_srna.merge_wigs)
        self.helper.remove_tmp(args_srna.trans)
        if args_srna.tss_folder is not None:
            self.helper.remove_tmp(args_srna.tss_folder)
        if args_srna.pro_folder is not None:
            self.helper.remove_tmp(args_srna.pro_folder)
        if args_srna.sorf_file is not None:
            self.helper.remove_tmp(args_srna.sorf_file)
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        if self.term_path is not None:
            self.helper.remove_tmp(args_srna.terms)

    def _filter_srna(self, args_srna, prefixs):
        if "sec_str" in args_srna.import_info:
            self._compute_2d_and_energy(args_srna, prefixs)
        if "blast_nr" in args_srna.import_info:
            self._blast(args_srna.nr_database, args_srna.nr_format, "prot",
                        args_srna, prefixs, "blastx", "nr", args_srna.e_nr)
        if "blast_srna" in args_srna.import_info:
            self._blast(args_srna.srna_database, args_srna.srna_format, "nucl",
                        args_srna, prefixs, "blastn", "sRNA", args_srna.e_srna)
        if "sorf" in args_srna.import_info:
            for prefix in prefixs:
                if ("_".join([prefix, "sORF.gff"]) in
                        os.listdir(self.sorf_path)):
                    tmp_srna = os.path.join(args_srna.out_folder,
                                            "".join(["tmp_srna_sorf", prefix]))
                    tmp_sorf = os.path.join(args_srna.out_folder,
                                            "".join(["tmp_sorf_srna", prefix]))
                    srna_sorf_comparison(
                            "_".join([self.prefixs["basic"], prefix]),
                            os.path.join(self.sorf_path,
                                         "_".join([prefix, "sORF.gff"])),
                            tmp_srna, tmp_sorf)
                    os.remove(tmp_sorf)
                    shutil.move(tmp_srna,
                                "_".join([self.prefixs["basic"], prefix]))

    def _import_info_format(self, import_info):
        new_info = []
        for info in import_info:
            info = info.lower()
            new_info.append(info)
        return new_info

    def _gen_table(self, prefixs, args_srna):
        for prefix in prefixs:
            out_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            gen_srna_table(os.path.join(self.all_best["all_gff"],
                           "_".join([prefix, "sRNA.gff"])),
                           "_".join([self.prefixs["merge_table"], prefix]),
                           "_".join([self.tmps["nr"], prefix + ".csv"]),
                           "_".join([self.tmps["srna"], prefix + ".csv"]),
                           args_srna, out_table)

    def _print_rank_all(self, prefixs):
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            print_rank_all(all_table, best_table)

    def _filter_min_utr(self, prefixs, min_utr):
        for prefix in prefixs:
            filter_utr(os.path.join(self.all_best["all_gff"],
                                    "_".join([prefix, "sRNA.gff"])),
                       os.path.join(self.all_best["all_table"],
                                    "_".join([prefix, "sRNA.csv"])), min_utr)

    def _antisense(self, gffs, prefixs):
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            srna_antisense(all_gff, all_table,
                           os.path.join(gffs, prefix + ".gff"))
            srna_antisense(best_gff, best_table,
                           os.path.join(gffs, prefix + ".gff"))

    def _blast_stat(self, stat_path, srna_tables):
        for srna_table in os.listdir(os.path.join(srna_tables, "best")):
            out_srna_blast = os.path.join(
                    stat_path, "stat_" +
                    srna_table.replace(".csv", "_blast.csv"))
        blast_class(os.path.join(srna_tables, "best", srna_table),
                    out_srna_blast)

    def _compare_term_promoter(self, out_table, prefix, args_srna):
        if ("term" in args_srna.import_info) and (
                self.term_path is not None):
            compare_srna_term(os.path.join(self.all_best["all_gff"],
                              "_".join([prefix, "sRNA.gff"])),
                              out_table, os.path.join(self.term_path,
                              "_".join([prefix, "term.gff"])),
                              args_srna.fuzzy_b, args_srna.fuzzy_a)
        if ("promoter" in args_srna.import_info) and (
                args_srna.promoter_table is not None) and (
                "tss" in args_srna.import_info):
            compare_srna_promoter(os.path.join(self.all_best["all_gff"],
                                  "_".join([prefix, "sRNA.gff"])),
                                  out_table, args_srna)

    def run_srna_detection(self, args_srna):
        self._check_necessary_file(args_srna)
        self.multiparser.parser_gff(args_srna.trans, "transcript")
        self.multiparser.combine_gff(args_srna.gffs, self.tran_path,
                                     None, "transcript")
        args_srna.import_info = self._import_info_format(args_srna.import_info)
        prefixs = self._run_program(args_srna)
        self._filter_srna(args_srna, prefixs)
        for prefix in prefixs:
            shutil.copyfile("_".join([self.prefixs["basic"], prefix]),
                            os.path.join(self.all_best["all_gff"],
                            "_".join([prefix, "sRNA.gff"])))
            self._compare_term_promoter("_".join([self.prefixs["merge_table"],
                                        prefix]), prefix, args_srna)
        self._gen_table(prefixs, args_srna)
        self._class_srna(prefixs, args_srna)
        self._get_best_result(prefixs, args_srna)
        self._print_rank_all(prefixs)
        if "blast_srna" in args_srna.import_info:
            self._blast_stat(self.stat_path, self.table_output)
        self._remove_file(args_srna)
示例#2
0
class sORFDetection(object):
    '''detection of sORF'''

    def __init__(self, args_sorf):
        self.multiparser = Multiparser()
        self.helper = Helper()
        if args_sorf.tsss is not None:
            self.tss_path = os.path.join(args_sorf.tsss, "tmp")
        else:
            self.tss_path = None
        if args_sorf.srnas is not None:
            self.srna_path = os.path.join(args_sorf.srnas, "tmp")
        else:
            self.srna_path = None
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        self.tran_path = os.path.join(args_sorf.trans, "tmp")
        self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
        self.all_cand = "all_candidates"
        self.best = "best_candidates"

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf, log):
        if (args_sorf.gffs is None) or (args_sorf.trans is None) or (
               (args_sorf.tex_wigs is None) and (args_sorf.frag_wigs is None)):
            print("Error: lack required files!")
            log.write("genome annotation, transcript file or wiggle files "
                      "are not assigned.\n")
            sys.exit()
        if args_sorf.utr_detect:
            if (args_sorf.tsss is None):
                print("Error: TSS files are required for UTR derived"
                      " sORF detection!")
                log.write("TSS files are required for UTR derived"
                          " sORF detection!\n")
                sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path,
                                         None, "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path,
                                         None, "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf, log):
        '''detect the sORF based on start and stop codon 
        and ribosome binding site'''
        log.write("Running sORF_detection.py for detecting sORFs.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            print("Searching sORFs of {0}".format(prefix))
            if self.srna_path is not None:
                srna_file = os.path.join(self.srna_path,
                                         "_".join([prefix, "sRNA.gff"]))
            else:
                srna_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(self.tss_path,
                                        "_".join([prefix, "TSS.gff"]))
            else:
                tss_file = None
            sorf_detection(os.path.join(self.fasta_path, prefix + ".fa"),
                           srna_file, os.path.join(args_sorf.out_folder,
                           "_".join([prefix, "inter.gff"])), tss_file,
                           os.path.join(args_sorf.wig_path,
                           "_".join([prefix, "forward.wig"])),
                           os.path.join(args_sorf.wig_path,
                           "_".join([prefix, "reverse.wig"])),
                           os.path.join(self.gff_output, self.all_cand,
                           "_".join([prefix, "sORF"])), args_sorf)
            if "_".join([prefix, "sORF_all.gff"]) in os.listdir(
                         os.path.join(self.gff_output, self.all_cand)):
                gff_all = os.path.join(self.gff_output, self.all_cand,
                                       "_".join([prefix, "sORF.gff"]))
                gff_best = os.path.join(self.gff_output, self.best,
                                        "_".join([prefix, "sORF.gff"]))
                csv_all = os.path.join(self.table_output, self.all_cand,
                                       "_".join([prefix, "sORF.csv"]))
                csv_best =  os.path.join(self.table_output, self.best,
                                         "_".join([prefix, "sORF.csv"]))
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_all.gff"])), gff_all)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_best.gff"])), gff_best)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_all.csv"])), csv_all)
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_best.csv"])), csv_best)
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")

    def _remove_tmp(self, args_sorf):
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        self.helper.remove_tmp_dir(args_sorf.fastas)
        self.helper.remove_tmp_dir(args_sorf.gffs)
        self.helper.remove_tmp_dir(args_sorf.tsss)
        self.helper.remove_tmp_dir(args_sorf.trans)
        self.helper.remove_tmp_dir(args_sorf.srnas)
        if "temp_wig" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "temp_wig"))
        if "merge_wigs" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "merge_wigs"))

    def _compare_tran_cds(self, args_sorf, log):
        '''compare transcript and CDS to find the intergenic region'''
        prefixs = []
        log.write("Running sORF_intergenic.py to extract the sequences of "
                  "potential sORFs\n")
        for gff in os.listdir(args_sorf.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Comparing transcripts and CDSs of {0}".format(prefix))
                get_intergenic(os.path.join(args_sorf.gffs, gff),
                               os.path.join(self.tran_path,
                               "_".join([prefix, "transcript.gff"])),
                               os.path.join(args_sorf.out_folder,
                               "_".join([prefix, "inter.gff"])),
                               args_sorf.utr_detect, args_sorf.hypo,
                               args_sorf.extend_5, args_sorf.extend_3)
                log.write("\t" + os.path.join(args_sorf.out_folder,
                          "_".join([prefix, "inter.gff"])) + 
                          " is generated to temporary store the sequences.\n")
        return prefixs

    def _re_table(self, args_sorf, prefixs, log):
        log.write("Running re_table.py for generating coverage information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates"]:
            for prefix in prefixs:
                table_file = os.path.join(args_sorf.out_folder, "tables",
                                          type_, "_".join([
                                          prefix, "sORF.csv"]))
                reorganize_table(args_sorf.libs, args_sorf.merge_wigs,
                                 "Track_detail", table_file)
                log.write("\t" + table_file + "\n")

    def run_sorf_detection(self, args_sorf, log):
        if args_sorf.fuzzy_rbs > 6:
            log.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self._check_necessary_files(args_sorf, log)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path,
                                     None, "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path, None)
        prefixs = self._compare_tran_cds(args_sorf, log)
        self._start_stop_codon(prefixs, args_sorf, log)
        log.write("Running stat_sorf.py to do statistics.\n")
        for sorf in os.listdir(os.path.join(self.gff_output, self.all_cand)):
            print("Running statistics of {0}".format(sorf))
            if sorf.endswith("_sORF.gff"):
                stat_file = os.path.join(args_sorf.out_folder, "statistics",
                            "_".join(["stat", sorf.replace(".gff", ".csv")]))
                stat(os.path.join(self.gff_output, self.all_cand, sorf),
                     os.path.join(self.gff_output, self.best, sorf), stat_file,
                     args_sorf.utr_detect)
                log.write("\t" + stat_file + " is generated.\n")
        self._re_table(args_sorf, prefixs, log)
        self._remove_tmp(args_sorf)
示例#3
0
class TSSpredator(object):

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                     "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                          prefix, cond["condition"], cond["replicate"],
                          os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                              prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix, log):
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assing tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _extract_best_para(self, args_tss, prefix, log):
        detect = False
        for best_file in os.listdir(args_tss.auto_load):
            if best_file == "_".join(["best", prefix + ".csv"]):
                bh = open(os.path.join(args_tss.auto_load, best_file),"r" )
                lines = bh.readlines()
                bh.close()
                if len(lines[len(lines)-1].split("\t")) < 8:
                    print("Error: some information in {0} is missing. "
                          "It may be due to that \"optimize_tss_ps\" did "
                          "not finish successfully.".format(best_file))
                    log.write("Error: some information in {0} is missing. "
                              "It may be due to that \"optimize_tss_ps\" did "
                              "not finish successfully.\n".format(best_file))
                    sys.exit()
                else:
                    para_info = lines[len(lines)-1].split("\t")[1].split("_")
                    detect_all =  all(elem in para_info
                            for elem in ["he", "rh", "fa", "rf",
                                         "bh", "ef", "pf"])
                    if (not detect_all) or (len(para_info) != 14):
                        print("Error: {0} is complete. Some parameters are "
                              "missing!".format(best_file))
                        log.write("Error: {0} is complete. Some parameters "
                                  "are missing!\n".format(best_file))
                        sys.exit()
                    else:
                        detect = True
                        height = para_info[para_info.index("he") + 1]
                        height_reduction = para_info[
                            para_info.index("rh") + 1]
                        factor = para_info[para_info.index("fa") + 1]
                        factor_reduction = para_info[
                            para_info.index("rf") + 1]
                        base_height = para_info[
                            para_info.index("bh") + 1]
                        enrichment_factor = para_info[
                            para_info.index("ef") + 1]
                        processing_factor = para_info[
                            para_info.index("pf") + 1]
        if detect:
            return height, height_reduction, factor, factor_reduction, \
                   base_height, enrichment_factor, processing_factor
        else:
            print("Error: No best_{0}.csv can be found in {1}! ".format(
                prefix, args_tss.auto_load))
            log.write("Error: No best_{0}.csv can be found in {1}\n".format(
                prefix, args_tss.auto_load))
            sys.exit()

    def _get_input_para(self, args_tss, prefix, log):
        if args_tss.genome_order is None:
            height = args_tss.height[0]
            height_reduction = args_tss.height_reduction[0]
            factor = args_tss.factor[0]
            factor_reduction = args_tss.factor_reduction[0]
            base_height = args_tss.base_height[0]
            enrichment_factor = args_tss.enrichment_factor[0]
            processing_factor = args_tss.processing_factor[0]
        else:
            if prefix not in args_tss.genome_order:
                print("Error: the parameters for {0} were not assigned!".format(
                    prefix))
                log.write("Error: the parameters for {0} were not assigned!\n".format(
                    prefix))
                sys.exit()
            else:
                index = args_tss.genome_order.index(prefix)
                height = args_tss.height[index]
                height_reduction = args_tss.height_reduction[index]
                factor = args_tss.factor[index]
                factor_reduction = args_tss.factor_reduction[index]
                base_height = args_tss.base_height[index]
                enrichment_factor = args_tss.enrichment_factor[index]
                processing_factor = args_tss.processing_factor[index]
        return height, height_reduction, factor, factor_reduction, \
               base_height, enrichment_factor, processing_factor

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file, log):
        '''generation of config files'''
        log.write("Generating config files for TSSpredator.\n")
        if args_tss.auto_load is not None:
            height, height_reduction, factor, factor_reduction, \
            base_height, enrichment_factor, processing_factor = \
            self._extract_best_para(args_tss, project_strain_name, log)
        else:
            height, height_reduction, factor, factor_reduction, \
            base_height, enrichment_factor, processing_factor = \
            self._get_input_para(args_tss, project_strain_name, log)
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
                  processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
                  args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
                  enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
                  factor_reduction))
        out.write("minCliffHeight = {0}\n".format(height))
        out.write("minCliffHeightDiscount = {0}\n".format(
                  height_reduction))
        out.write("minNormalHeight = {0}\n".format(base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                      prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                           prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                           "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                      out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                           out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                log.write("\t" + out_file + "is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected TSS 
        and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                    self.stat_outfolder, tss,
                    "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                    self.stat_outfolder, tss, "".join([
                        "stat_compare_TSS_transcript_",
                        tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        print("Running statistaics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _get_prefixs(self, args_tss):
        prefixs = []
        detect = False
        for fasta in os.listdir(self.fasta_path):
            run = False
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                          prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                          prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and 
        processing site at the same position'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_processing.gff",
                                tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_TSS.gff",
                                tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        out.close()

    def run_tsspredator(self, args_tss, log):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._get_prefixs(args_tss)
        for prefix in prefixs:
            config = os.path.join(input_folder,
                                  "_".join(["config", prefix]) + ".ini")
            self._gen_config(
                prefix, args_tss,
                os.path.join(self.gff_path, prefix + ".gff"), self.wig_path,
                os.path.join(self.fasta_path, prefix + ".fa"), config, log)
            out_path = os.path.join(
                    self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                     os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                         None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
示例#4
0
文件: sorf.py 项目: Pranjpb/ANNOgesic
class sORFDetection(object):
    '''detection of sORF'''

    def __init__(self, args_sorf):
        self.multiparser = Multiparser()
        self.helper = Helper()
        if args_sorf.tsss is not None:
            self.tss_path = os.path.join(args_sorf.tsss, "tmp")
        else:
            self.tss_path = None
        if args_sorf.srnas is not None:
            self.srna_path = os.path.join(args_sorf.srnas, "tmp")
        else:
            self.srna_path = None
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        self.tran_path = os.path.join(args_sorf.trans, "tmp")
        self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
        self.all_cand = "all_candidates"
        self.best = "best"

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf):
        if (args_sorf.gffs is None) or \
           (args_sorf.trans is None) or \
           ((args_sorf.tex_wigs is None) and (args_sorf.frag_wigs is None)):
            print("Error: lack required files!!!!")
            sys.exit()
        if args_sorf.utr_detect:
            if (args_sorf.tsss is None):
                print("Error: lack required files for UTR derived"
                      " sORF detection!!!!")
                sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path,
                                         None, "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path,
                                         None, "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf):
        '''detect the sORF based on start and stop codon 
        and ribosome binding site'''
        for prefix in prefixs:
            if self.srna_path is not None:
                srna_file = os.path.join(self.srna_path,
                                         "_".join([prefix, "sRNA.gff"]))
            else:
                srna_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(self.tss_path,
                                        "_".join([prefix, "TSS.gff"]))
            else:
                tss_file = None
            sorf_detection(os.path.join(self.fasta_path, prefix + ".fa"),
                           srna_file, os.path.join(args_sorf.out_folder,
                           "_".join([prefix, "inter.gff"])), tss_file,
                           os.path.join(args_sorf.wig_path,
                           "_".join([prefix, "forward.wig"])),
                           os.path.join(args_sorf.wig_path,
                           "_".join([prefix, "reverse.wig"])),
                           os.path.join(self.gff_output, self.all_cand,
                           "_".join([prefix, "sORF"])), args_sorf)
            if "_".join([prefix, "sORF_all.gff"]) in os.listdir(
                         os.path.join(self.gff_output, self.all_cand)):
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_all.gff"])),
                            os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF.gff"])))
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_best.gff"])),
                            os.path.join(self.gff_output, self.best,
                            "_".join([prefix, "sORF.gff"])))
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_all.csv"])),
                            os.path.join(self.table_output, self.all_cand,
                            "_".join([prefix, "sORF.csv"])))
                shutil.move(os.path.join(self.gff_output, self.all_cand,
                            "_".join([prefix, "sORF_best.csv"])),
                            os.path.join(self.table_output, self.best,
                            "_".join([prefix, "sORF.csv"])))

    def _remove_tmp(self, args_sorf):
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        self.helper.remove_tmp_dir(args_sorf.fastas)
        self.helper.remove_tmp_dir(args_sorf.gffs)
        self.helper.remove_tmp_dir(args_sorf.tsss)
        self.helper.remove_tmp_dir(args_sorf.trans)
        self.helper.remove_tmp_dir(args_sorf.srnas)
        if "temp_wig" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "temp_wig"))
        if "merge_wigs" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "merge_wigs"))

    def _compare_tran_cds(self, args_sorf):
        '''compare transcript and CDS to find the intergenic region'''
        prefixs = []
        for gff in os.listdir(args_sorf.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Comparing transcript and CDS of {0}".format(prefix))
                get_intergenic(os.path.join(args_sorf.gffs, gff),
                               os.path.join(self.tran_path,
                               "_".join([prefix, "transcript.gff"])),
                               os.path.join(args_sorf.out_folder,
                               "_".join([prefix, "inter.gff"])),
                               args_sorf.utr_detect, args_sorf.hypo)
        return prefixs

    def run_sorf_detection(self, args_sorf):
        if args_sorf.fuzzy_rbs > 6:
            print("Error: --fuzzy_rbs should be equal or less than 6!!")
            sys.exit()
        self._check_necessary_files(args_sorf)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path,
                                     None, "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path, None)
        prefixs = self._compare_tran_cds(args_sorf)
        self._start_stop_codon(prefixs, args_sorf)
        for sorf in os.listdir(os.path.join(self.gff_output, self.all_cand)):
            print("Running statistics of {0}".format(sorf))
            if sorf.endswith("_sORF.gff"):
                stat(os.path.join(self.gff_output, self.all_cand, sorf),
                     os.path.join(self.gff_output, self.best, sorf),
                     os.path.join(args_sorf.out_folder, "statistics",
                     "_".join(["stat", sorf.replace(".gff", ".csv")])),
                     args_sorf.utr_detect)
        self._remove_tmp(args_sorf)
示例#5
0
class TestMultiparser(unittest.TestCase):
    def setUp(self):
        self.multiparser = Multiparser()
        self.example = Example()
        self.ref_folder = "ref_folder"
        if (not os.path.exists(self.ref_folder)):
            os.mkdir(self.ref_folder)
        self.tar_folder = "tar_folder"
        if (not os.path.exists(self.tar_folder)):
            os.mkdir(self.tar_folder)

    def tearDown(self):
        if os.path.exists(self.ref_folder):
            shutil.rmtree(self.ref_folder)
        if os.path.exists(self.tar_folder):
            shutil.rmtree(self.tar_folder)

    def test_combine_fasta(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.gff_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_tar, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_tar, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_gff1 = os.path.join(tmp_ref, "aaa.gff")
        with open(sub_gff1, "w") as rh:
            rh.write(self.example.sub_gff1)
        sub_gff2 = os.path.join(tmp_ref, "bbb.gff")
        with open(sub_gff2, "w") as rh:
            rh.write(self.example.sub_gff2)
        self.multiparser.combine_fasta(self.ref_folder, tmp_tar, None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.fa")))

    def test_combine_wig(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_ref, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_ref, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_wig1 = os.path.join(tmp_tar, "test_forward.wig_STRAIN_aaa.wig")
        sub_wig2 = os.path.join(tmp_tar, "test_forward.wig_STRAIN_bbb.wig")
        sub_wig3 = os.path.join(tmp_tar, "test_reverse.wig_STRAIN_aaa.wig")
        sub_wig4 = os.path.join(tmp_tar, "test_reverse.wig_STRAIN_bbb.wig")
        wig_files = [sub_wig1, sub_wig2, sub_wig3, sub_wig4]
        example_wigs = [
            self.example.sub_f_wig1, self.example.sub_f_wig2,
            self.example.sub_r_wig1, self.example.sub_r_wig2
        ]
        for index in range(0, 4):
            with open(wig_files[index], "w") as fh:
                fh.write(example_wigs[index])
        libs = [
            "test_forward.wig_STRAIN_aaa.wig:frag:1:a:+",
            "test_reverse.wig_STRAIN_aaa.wig:frag:1:a:-"
        ]
        self.multiparser.combine_wig(self.ref_folder, tmp_tar, "fasta", libs)
        self.assertTrue(
            os.path.exists(os.path.join(tmp_tar, "test_forward.wig")))
        self.assertTrue(
            os.path.exists(os.path.join(tmp_tar, "test_reverse.wig")))

    def test_combine_gff(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_ref, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_ref, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_gff1 = os.path.join(tmp_tar, "aaa.gff")
        with open(sub_gff1, "w") as rh:
            rh.write(self.example.sub_gff1)
        sub_gff2 = os.path.join(tmp_tar, "bbb.gff")
        with open(sub_gff2, "w") as rh:
            rh.write(self.example.sub_gff2)
        self.multiparser.combine_gff(self.ref_folder, tmp_tar, "fasta", None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.gff")))

    def test_parser_fasta(self):
        fasta_file = os.path.join(self.ref_folder, "test.fa")
        with open(fasta_file, "w") as rh:
            rh.write(self.example.fasta_file)
        self.multiparser.parser_fasta(self.ref_folder)
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/aaa.fa")))
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/bbb.fa")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder, "test.fa_folder/aaa.fa")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder, "test.fa_folder/bbb.fa")))

    def test_parser_gff(self):
        gff_file = os.path.join(self.ref_folder, "test.gff")
        with open(gff_file, "w") as rh:
            rh.write(self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, None)
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/aaa.gff")))
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/bbb.gff")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder, "test.gff_folder/aaa.gff")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder, "test.gff_folder/bbb.gff")))
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        os.rename(gff_file, tss_file)
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        with open(tss_file, "w") as rh:
            rh.write(self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, "TSS")
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/aaa_TSS.gff")))
        self.assertTrue(
            os.path.exists(os.path.join(self.ref_folder, "tmp/bbb_TSS.gff")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "test_TSS.gff_folder/aaa_TSS.gff")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "test_TSS.gff_folder/bbb_TSS.gff")))

    def test_parser_wig(self):
        wig_f_file = os.path.join(self.ref_folder, "test_forward.wig")
        with open(wig_f_file, "w") as rh:
            rh.write(self.example.wig_f_file)
        wig_r_file = os.path.join(self.ref_folder, "test_reverse.wig")
        with open(wig_r_file, "w") as rh:
            rh.write(self.example.wig_r_file)
        self.multiparser.parser_wig(self.ref_folder)
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "tmp/test_forward_STRAIN_aaa.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "tmp/test_forward_STRAIN_bbb.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "tmp/test_reverse_STRAIN_aaa.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.ref_folder,
                             "tmp/test_reverse_STRAIN_bbb.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(
                    self.ref_folder,
                    "test_forward.wig_folder/test_forward_STRAIN_aaa.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(
                    self.ref_folder,
                    "test_forward.wig_folder/test_forward_STRAIN_bbb.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(
                    self.ref_folder,
                    "test_reverse.wig_folder/test_reverse_STRAIN_aaa.wig")))
        self.assertTrue(
            os.path.exists(
                os.path.join(
                    self.ref_folder,
                    "test_reverse.wig_folder/test_reverse_STRAIN_bbb.wig")))
示例#6
0
class sRNADetection(object):
    '''detection of sRNA'''
    def __init__(self, args_srna):
        self.args_container = ArgsContainer()
        self.helper = Helper()
        self.multiparser = Multiparser()
        self.gff_output = os.path.join(args_srna.out_folder, "gffs")
        self.table_output = os.path.join(args_srna.out_folder, "tables")
        self.stat_path = os.path.join(args_srna.out_folder, "statistics")
        self.tss_path = self._check_folder_exist(args_srna.tss_folder)
        self.pro_path = self._check_folder_exist(args_srna.pro_folder)
        self.sorf_path = self._check_folder_exist(args_srna.sorf_file)
        self.fasta_path = os.path.join(args_srna.fastas, "tmp")
        self.tran_path = os.path.join(args_srna.trans, "tmp")
        self.term_path = self._check_folder_exist(args_srna.terms)
        self.merge_wigs = os.path.join(args_srna.out_folder, "merge_wigs")
        self.prefixs = {
            "merge": os.path.join(args_srna.out_folder, "tmp_merge"),
            "utr": os.path.join(args_srna.out_folder, "tmp_utrsrna"),
            "normal": os.path.join(args_srna.out_folder, "tmp_normal"),
            "in_cds": os.path.join(args_srna.out_folder, "tmp_incds"),
            "merge_table": os.path.join(args_srna.out_folder,
                                        "tmp_merge_table"),
            "utr_table": os.path.join(args_srna.out_folder,
                                      "tmp_utrsrna_table"),
            "normal_table": os.path.join(args_srna.out_folder,
                                         "tmp_normal_table"),
            "in_cds_table": os.path.join(args_srna.out_folder,
                                         "tmp_incds_table"),
            "basic": os.path.join(args_srna.out_folder, "tmp_basic"),
            "energy": os.path.join(args_srna.out_folder, "tmp_energy")
        }
        self.tmps = {
            "nr": os.path.join(args_srna.out_folder, "tmp_nr"),
            "srna": os.path.join(args_srna.out_folder, "tmp_sRNA")
        }
        self.best_table = os.path.join(self.table_output, "best")
        self.table_output = os.path.join(args_srna.out_folder, "tables")
        self.stat_path = os.path.join(args_srna.out_folder, "statistics")
        self.all_best = {
            "all_gff": os.path.join(self.gff_output, "all_candidates"),
            "best_gff": os.path.join(self.gff_output, "best"),
            "all_table": os.path.join(self.table_output, "all_candidates"),
            "best_table": os.path.join(self.table_output, "best")
        }

    def _check_folder_exist(self, folder):
        if folder is not None:
            path = os.path.join(folder, "tmp")
        else:
            path = None
        return path

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _run_format(self, blast_path, database, type_, db_file, err):
        call([
            os.path.join(blast_path, "makeblastdb"), "-in", database,
            "-dbtype", type_, "-out", db_file
        ],
             stderr=err)

    def _formatdb(self, database, type_, out_folder, blast_path,
                  database_type):
        err = open(os.path.join(out_folder, "log.txt"), "w")
        if (database.endswith(".fa")) or (database.endswith(".fna")) or (
                database.endswith(".fasta")):
            pass
        else:
            folders = database.split("/")
            filename = folders[-1]
            folder = "/".join(folders[:-1])
            for fasta in os.listdir(folder):
                if (fasta.endswith(".fa")) or (fasta.endswith(".fna")) or (
                        fasta.endswith(".fasta")):
                    if ".".join(fasta.split(".")[:-1]) == filename:
                        database = os.path.join(folder, fasta)
        if database_type == "sRNA":
            change_format(database, "tmp_srna_database")
            os.remove(database)
            shutil.move("tmp_srna_database", database)
        db_file = ".".join(database.split(".")[:-1])
        self._run_format(blast_path, database, type_, db_file, err)
        err.close()

    def _merge_frag_tex_file(self, files, args_srna):
        '''merge the results of fragmented and tex treated libs'''
        if (args_srna.frag_wigs is not None) and (args_srna.tex_wigs
                                                  is not None):
            self.helper.merge_file(files["frag_gff"], files["tex_gff"])
            self.helper.merge_file(files["frag_csv"], files["tex_csv"])
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])
            os.remove(files["frag_csv"])
            os.remove(files["frag_gff"])
            os.remove(files["tex_gff"])
        elif (args_srna.frag_wigs is not None):
            shutil.move(files["frag_csv"], files["merge_csv"])
            self.helper.sort_gff(files["frag_gff"], files["merge_gff"])
            os.remove(files["frag_gff"])
        elif (args_srna.tex_wigs is not None):
            shutil.move(files["tex_csv"], files["merge_csv"])
            self.helper.sort_gff(files["tex_gff"], files["merge_gff"])

    def _read_lib_wig(self, args_srna):
        libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
        wigs_f = read_wig(args_srna.wig_f_file, "+", libs)
        wigs_r = read_wig(args_srna.wig_r_file, "-", libs)
        return [libs, texs, wigs_f, wigs_r]

    def _run_normal(self, prefix, gff, tran, fuzzy_tss, args_srna):
        '''detection of intergenic and antisense sRNA'''
        tex_datas = None
        frag_datas = None
        if "tmp_cutoff_inter" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_cutoff_inter"))
        files = {
            "frag_gff": None,
            "frag_csv": None,
            "tex_gff": None,
            "tex_csv": None,
            "merge_gff": None,
            "merge_csv": None
        }
        if self.tss_path is not None:
            tss = self.helper.get_correct_file(self.tss_path, "_TSS.gff",
                                               prefix, None, None)
        else:
            tss = None
        if self.pro_path is not None:
            pro = self.helper.get_correct_file(self.pro_path,
                                               "_processing.gff", prefix, None,
                                               None)
        else:
            pro = None
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(args_srna.out_folder,
                                             "_".join(["tmp_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_frag_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "frag", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss)
            frag_datas = self._read_lib_wig(args_srna)
            intergenic_srna(args_srna, frag_datas[0], frag_datas[1],
                            frag_datas[2], frag_datas[3])
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(args_srna.out_folder,
                                            "_".join(["tmp_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_tex_table", prefix]))
            args_srna = self.args_container.container_intersrna(
                "tex", files, args_srna, prefix,
                os.path.join(args_srna.gffs, gff), tran, tss, pro, fuzzy_tss)
            tex_datas = self._read_lib_wig(args_srna)
            intergenic_srna(args_srna, tex_datas[0], tex_datas[1],
                            tex_datas[2], tex_datas[3])
        files["merge_csv"] = "_".join([self.prefixs["normal_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["normal"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        if ("TSS_class" in os.listdir(
                args_srna.out_folder)) and (not args_srna.tss_source):
            tss = os.path.join(args_srna.out_folder, "TSS_class",
                               prefix + "_TSS.gff")
        return tss, frag_datas, tex_datas

    def _run_utrsrna(self, gff, tran, prefix, tss, pro, args_srna, frag_datas,
                     tex_datas):
        '''detection of UTR-derived sRNA'''
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        files = {
            "frag_gff": None,
            "frag_csv": None,
            "tex_gff": None,
            "tex_csv": None,
            "merge_gff": None,
            "merge_csv": None
        }
        if args_srna.tex_wigs is not None:
            files["tex_gff"] = os.path.join(args_srna.out_folder,
                                            "_".join(["tmp_utr_tex", prefix]))
            files["tex_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_tex_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files, pro,
                os.path.join(self.fasta_path, prefix + ".fa"), "tex", prefix,
                args_srna)
            utr_derived_srna(args_srna, tex_datas[0], tex_datas[1],
                             tex_datas[2], tex_datas[3])
        if args_srna.frag_wigs is not None:
            files["frag_gff"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag", prefix]))
            files["frag_csv"] = os.path.join(
                args_srna.out_folder, "_".join(["tmp_utr_frag_table", prefix]))
            args_srna = self.args_container.container_utrsrna(
                os.path.join(args_srna.gffs, gff), tran, tss, files, pro,
                os.path.join(self.fasta_path, prefix + ".fa"), "frag", prefix,
                args_srna)
            utr_derived_srna(args_srna, frag_datas[0], frag_datas[1],
                             frag_datas[2], frag_datas[3])
        files["merge_csv"] = "_".join([self.prefixs["utr_table"], prefix])
        files["merge_gff"] = "_".join([self.prefixs["utr"], prefix])
        self._merge_frag_tex_file(files, args_srna)
        filter_utr(files["merge_gff"], files["merge_csv"], args_srna.min_utr)

    def _check_necessary_file(self, args_srna):
        if (args_srna.gffs is None) or (args_srna.trans is None) or (
            (args_srna.tex_wigs is None) and (args_srna.frag_wigs is None)):
            print("Error: lack required files!!!!")
            sys.exit()
        if args_srna.utr_srna:
            if (args_srna.tss_folder is None):
                print("Error: lack required TSS files for UTR "
                      "derived sRNA detection!!!!")
                sys.exit()
            if (args_srna.pro_folder is None):
                print("Warning: lack Processing site files for UTR "
                      "derived sRNA detection!!!")
                print("it may effect the results!!!!")
        self._check_gff(args_srna.gffs)
        self._check_gff(args_srna.trans)
        if args_srna.tss_folder is not None:
            self._check_gff(args_srna.tss_folder)
            self.multiparser.parser_gff(args_srna.tss_folder, "TSS")
            self.multiparser.combine_gff(args_srna.gffs, self.tss_path, None,
                                         "TSS")
        if args_srna.pro_folder is not None:
            self._check_gff(args_srna.pro_folder)
            self.multiparser.parser_gff(args_srna.pro_folder, "processing")
            self.multiparser.combine_gff(args_srna.gffs, self.pro_path, None,
                                         "processing")
        if args_srna.sorf_file is not None:
            self._check_gff(args_srna.sorf_file)
            self.multiparser.parser_gff(args_srna.sorf_file, "sORF")
            self.multiparser.combine_gff(args_srna.gffs, self.sorf_path, None,
                                         "sORF")
        if args_srna.import_info is not None:
            if args_srna.utr_srna or ("sec_str" in args_srna.import_info) or (
                    args_srna.nr_database
                    is not None) or (args_srna.srna_database is not None):
                if args_srna.fastas is None:
                    print("Error: lack required fasta files for UTR "
                          "derived sRNA detection!!!!")
                    sys.exit()
                self.multiparser.parser_fasta(args_srna.fastas)
                self.multiparser.combine_fasta(args_srna.gffs, self.fasta_path,
                                               None)
        if args_srna.terms is not None:
            self._check_gff(args_srna.terms)
            self.multiparser.parser_gff(args_srna.terms, "term")
            self.multiparser.combine_gff(args_srna.gffs, self.term_path, None,
                                         "term")
        else:
            self.term_path = None

    def _merge_tex_frag_datas(self, tex_datas, frag_datas):
        if (tex_datas is not None) and (frag_datas is not None):
            for index in [2, 3]:
                for strain, conds in frag_datas[index].items():
                    if strain not in tex_datas[index].keys():
                        tex_datas[index][strain] = conds
                    else:
                        for cond, tracks in conds.items():
                            tex_datas[index][strain][cond] = tracks
        elif (tex_datas is None) and (frag_datas is not None):
            tex_datas = frag_datas
        return tex_datas

    def _run_program(self, args_srna):
        prefixs = []
        tss = None
        for gff in os.listdir(args_srna.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Running sRNA detection of {0}....".format(prefix))
                tran = self.helper.get_correct_file(self.tran_path,
                                                    "_transcript.gff", prefix,
                                                    None, None)
                gffs = {
                    "merge": "_".join([self.prefixs["merge"], prefix]),
                    "utr": "_".join([self.prefixs["utr"], prefix]),
                    "normal": "_".join([self.prefixs["normal"], prefix])
                }
                csvs = {
                    "merge": "_".join([self.prefixs["merge_table"], prefix]),
                    "utr": "_".join([self.prefixs["utr_table"], prefix]),
                    "normal": "_".join([self.prefixs["normal_table"], prefix])
                }
                tss, frag_datas, tex_datas = self._run_normal(
                    prefix, gff, tran, args_srna.fuzzy_tsss["inter"],
                    args_srna)
                if args_srna.utr_srna:
                    print("Running UTR derived sRNA detection of {0}".format(
                        prefix))
                    if tss is None:
                        tss = self.helper.get_correct_file(
                            self.tss_path, "_TSS.gff", prefix, None, None)
                    if self.pro_path is not None:
                        pro = self.helper.get_correct_file(
                            self.pro_path, "_processing.gff", prefix, None,
                            None)
                    else:
                        pro = None
                    if tss is not None:
                        self._run_utrsrna(gff, tran, prefix, tss, pro,
                                          args_srna, frag_datas, tex_datas)
                tex_datas = self._merge_tex_frag_datas(tex_datas, frag_datas)
                del frag_datas
                gc.collect()
                self._merge_srna(args_srna, gffs, csvs, prefix,
                                 os.path.join(args_srna.gffs, gff), tss,
                                 tex_datas)
                del tex_datas
                filter_frag(csvs["merge"], gffs["merge"])
                self.helper.sort_gff(gffs["merge"],
                                     "_".join([self.prefixs["basic"], prefix]))
        return prefixs

    def _merge_srna(self, args_srna, gffs, csvs, prefix, gff_file, tss,
                    tex_datas):
        print("merging data of sRNA...")
        merge_srna_gff(gffs, args_srna.in_cds, args_srna.cutoff_overlap,
                       gff_file)
        merge_srna_table(gffs["merge"], csvs, tex_datas[2], tex_datas[3], tss,
                         args_srna)

    def _run_RNAfold(self, seq_file, vienna_path, sec_file):
        os.system(" ".join([
            "cat", seq_file, "|",
            os.path.join(vienna_path, "RNAfold"), "-p", ">", sec_file
        ]))

    def _get_seq_sec(self, fasta_path, out_folder, prefix, sec_path, dot_path,
                     vienna_path):
        '''extract the sec str energy'''
        detect = False
        for fasta in os.listdir(fasta_path):
            if fasta.endswith(".fa") and (fasta.replace(".fa", "") == prefix):
                detect = True
                break
        if detect:
            detect = False
            seq_file = os.path.join(out_folder, "_".join(["sRNA_seq", prefix]))
            sec_file = os.path.join(out_folder, "_".join(["sRNA_2d", prefix]))
            self.helper.get_seq("_".join([self.prefixs["basic"], prefix]),
                                os.path.join(fasta_path, fasta), seq_file)
        else:
            print("Error:There is not fasta file of {0}".format(prefix))
            print("please check your imported information")
            sys.exit()
        tmp_path = os.path.join(out_folder, "tmp_srna")
        self.helper.check_make_folder(tmp_path)
        main_path = os.getcwd()
        os.chdir(tmp_path)
        sec_file = os.path.join(main_path, sec_file)
        seq_file = os.path.join(main_path, seq_file)
        tmp_sec_path = os.path.join(main_path, sec_path)
        tmp_dot_path = os.path.join(main_path, dot_path)
        self._run_RNAfold(seq_file, vienna_path, sec_file)
        extract_energy(
            os.path.join(main_path, "_".join([self.prefixs["basic"], prefix])),
            sec_file,
            os.path.join(main_path, "_".join([self.prefixs["energy"],
                                              prefix])))
        for ps in os.listdir(os.getcwd()):
            new_ps = ps.replace("|", "_")
            shutil.move(ps, new_ps)
        return {
            "sec": tmp_sec_path,
            "dot": tmp_dot_path,
            "main": main_path,
            "tmp": os.path.join(main_path, tmp_path)
        }

    def _run_replot(self, vienna_util, tmp_paths, file_, dot_file, rel_file):
        os.system(" ".join([
            os.path.join(vienna_util, "relplot.pl"),
            os.path.join(tmp_paths["tmp"], file_),
            os.path.join(tmp_paths["tmp"], dot_file), ">",
            os.path.join(tmp_paths["tmp"], rel_file)
        ]))

    def _convert_pdf(self, ps2pdf14_path, tmp_paths, file_, pdf_file):
        call([ps2pdf14_path, os.path.join(tmp_paths["tmp"], file_), pdf_file])

    def _replot_sec_to_pdf(self, vienna_util, tmp_paths, ps2pdf14_path,
                           prefix):
        for file_ in os.listdir(os.getcwd()):
            if file_.endswith("ss.ps"):
                dot_file = file_.replace("ss.ps", "dp.ps")
                rel_file = file_.replace("ss.ps", "rss.ps")
                print("replot {0}".format(file_))
                self._run_replot(vienna_util, tmp_paths, file_, dot_file,
                                 rel_file)
        for file_ in os.listdir(tmp_paths["tmp"]):
            if (file_.endswith("rss.ps")) or (file_.endswith("dp.ps")):
                pdf_file = file_.replace(".ps", ".pdf")
                print("convert {0} to pdf".format(file_))
                self._convert_pdf(ps2pdf14_path, tmp_paths, file_, pdf_file)
        os.mkdir(os.path.join(tmp_paths["sec"], prefix))
        os.mkdir(os.path.join(tmp_paths["dot"], prefix))
        self.helper.move_all_content(tmp_paths["tmp"],
                                     os.path.join(tmp_paths["sec"], prefix),
                                     ["rss.pdf"])
        self.helper.move_all_content(tmp_paths["tmp"],
                                     os.path.join(tmp_paths["dot"], prefix),
                                     ["dp.pdf"])

    def _run_mountain(self, vienna_util, tmp_paths, dot_file, out):
        call([
            os.path.join(vienna_util, "mountain.pl"),
            os.path.join(tmp_paths["tmp"], dot_file)
        ],
             stdout=out)

    def _plot_mountain(self, mountain, moun_path, tmp_paths, prefix,
                       vienna_util):
        if mountain:
            tmp_moun_path = os.path.join(tmp_paths["main"], moun_path)
            os.mkdir(os.path.join(tmp_moun_path, prefix))
            txt_path = os.path.join(tmp_paths["tmp"], "tmp_txt")
            self.helper.check_make_folder(txt_path)
            print("Generating mountain plot of {0}....".format(prefix))
            for dot_file in os.listdir(tmp_paths["tmp"]):
                if dot_file.endswith("dp.ps"):
                    moun_txt = os.path.join(tmp_paths["tmp"], "mountain.txt")
                    out = open(moun_txt, "w")
                    moun_file = dot_file.replace("dp.ps", "mountain.pdf")
                    print("Generating {0}".format(moun_file))
                    self._run_mountain(vienna_util, tmp_paths, dot_file, out)
                    plot_mountain_plot(moun_txt, moun_file)
                    shutil.move(moun_file,
                                os.path.join(tmp_moun_path, prefix, moun_file))
                    out.close()
                    os.remove(moun_txt)

    def _compute_2d_and_energy(self, args_srna, prefixs):
        print("Running energy calculation....")
        moun_path = os.path.join(args_srna.out_folder, "mountain_plot")
        sec_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "sec_plot")
        dot_path = os.path.join(args_srna.out_folder, "sec_structure",
                                "dot_plot")
        self.helper.remove_all_content(sec_path, None, "dir")
        self.helper.remove_all_content(dot_path, None, "dir")
        self.helper.remove_all_content(moun_path, None, "dir")
        for prefix in prefixs:
            tmp_paths = self._get_seq_sec(self.fasta_path,
                                          args_srna.out_folder, prefix,
                                          sec_path, dot_path,
                                          args_srna.vienna_path)
            self._replot_sec_to_pdf(args_srna.vienna_util, tmp_paths,
                                    args_srna.ps2pdf14_path, prefix)
            self._plot_mountain(args_srna.mountain, moun_path, tmp_paths,
                                prefix, args_srna.vienna_util)
            self.helper.remove_all_content(os.getcwd(), ".ps", "file")
            os.chdir(tmp_paths["main"])
            shutil.move("_".join([self.prefixs["energy"], prefix]),
                        "_".join([self.prefixs["basic"], prefix]))
            shutil.rmtree(os.path.join(args_srna.out_folder, "tmp_srna"))

    def _run_blast(self, blast_path, program, database, e, seq_file,
                   blast_file, strand):
        call([
            os.path.join(blast_path, program), "-db", database, "-evalue",
            str(e), "-strand", strand, "-query", seq_file, "-out", blast_file
        ])

    def _get_strand_fasta(self, seq_file, out_folder):
        tmp_plus = os.path.join(out_folder, "tmp_plus.fa")
        tmp_minus = os.path.join(out_folder, "tmp_minus.fa")
        out_p = open(tmp_plus, "w")
        out_m = open(tmp_minus, "w")
        strand = ""
        with open(seq_file) as sh:
            for line in sh:
                line = line.strip()
                if line.startswith(">"):
                    if line[-1] == "+":
                        out_p.write(line + "\n")
                        strand = "plus"
                    elif line[-1] == "-":
                        out_m.write(line + "\n")
                        strand = "minus"
                else:
                    if strand == "plus":
                        out_p.write(line + "\n")
                    elif strand == "minus":
                        out_m.write(line + "\n")
        out_p.close()
        out_m.close()
        return tmp_plus, tmp_minus

    def _blast(self, database, database_format, data_type, args_srna, prefixs,
               program, database_type, e):
        if (database is None):
            print("Error: No database assigned!")
        else:
            if database_format:
                self._formatdb(database, data_type, args_srna.out_folder,
                               args_srna.blast_path, database_type)
            for prefix in prefixs:
                blast_file = os.path.join(
                    args_srna.out_folder, "blast_result_and_misc",
                    "_".join([database_type, "blast", prefix + ".txt"]))
                srna_file = "_".join([self.prefixs["basic"], prefix])
                out_file = os.path.join(
                    args_srna.out_folder,
                    "_".join(["tmp", database_type, prefix]))
                print("Running Blast of {0} in {1}".format(prefix, database))
                seq_file = os.path.join(args_srna.out_folder,
                                        "_".join(["sRNA_seq", prefix]))
                if seq_file not in os.listdir(args_srna.out_folder):
                    self.helper.get_seq(
                        srna_file, os.path.join(self.fasta_path,
                                                prefix + ".fa"), seq_file)
                if database_type == "nr":
                    tmp_plus, tmp_minus = self._get_strand_fasta(
                        seq_file, args_srna.out_folder)
                    tmp_blast = os.path.join("tmp_blast.txt")
                    self._run_blast(args_srna.blast_path, program, database, e,
                                    tmp_plus, tmp_blast, "plus")
                    self._run_blast(args_srna.blast_path, program, database, e,
                                    tmp_minus, blast_file, "minus")
                    self.helper.merge_file(tmp_blast, blast_file)
                    os.remove(tmp_blast)
                    os.remove(tmp_plus)
                    os.remove(tmp_minus)
                else:
                    self._run_blast(args_srna.blast_path, program, database, e,
                                    seq_file, blast_file, "both")
                extract_blast(blast_file, srna_file, out_file,
                              out_file + ".csv", database_type)
                shutil.move(out_file, srna_file)

    def _class_srna(self, prefixs, args_srna):
        '''classify the sRNA based on the filters'''
        if (args_srna.import_info
                is not None) or (args_srna.srna_database is not None) or (
                    args_srna.nr_database
                    is not None) or (self.sorf_path is not None) or (
                        self.tss_path
                        is not None) or (self.term_path is not None) or (
                            args_srna.promoter_table is not None):
            for prefix in prefixs:
                print("classifying sRNA of {0}".format(prefix))
                class_gff = os.path.join(self.gff_output, "for_class")
                class_table = os.path.join(self.table_output, "for_class")
                self.helper.check_make_folder(os.path.join(
                    class_table, prefix))
                self.helper.check_make_folder(os.path.join(class_gff, prefix))
                class_gff = os.path.join(class_gff, prefix)
                class_table = os.path.join(class_table, prefix)
                self.helper.check_make_folder(class_table)
                self.helper.check_make_folder(class_gff)
                out_stat = os.path.join(
                    self.stat_path,
                    "_".join(["stat_sRNA_class", prefix + ".csv"]))
                classify_srna(
                    os.path.join(self.all_best["all_gff"],
                                 "_".join([prefix, "sRNA.gff"])), class_gff,
                    out_stat, args_srna)
                for srna in os.listdir(class_gff):
                    out_table = os.path.join(class_table,
                                             srna.replace(".gff", ".csv"))
                    gen_srna_table(
                        os.path.join(class_gff, srna),
                        "_".join([self.prefixs["merge_table"], prefix]),
                        "_".join([self.tmps["nr"], prefix + ".csv"]),
                        "_".join([self.tmps["srna"], prefix + ".csv"]),
                        args_srna, out_table, self.term_path)

    def _get_best_result(self, prefixs, args_srna):
        '''get the best results based on the filters'''
        for prefix in prefixs:
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            gen_best_srna(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])), best_gff,
                args_srna)
            gen_srna_table(
                os.path.join(self.all_best["best_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                "_".join([self.prefixs["merge_table"], prefix]),
                "_".join([self.tmps["nr"], prefix + ".csv"]),
                "_".join([self.tmps["srna"], prefix + ".csv"]), args_srna,
                best_table, self.term_path)

    def _remove_file(self, args_srna):
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "dir")
        self.helper.remove_all_content(args_srna.out_folder, "tmp_", "file")
        self.helper.remove_tmp(args_srna.fastas)
        self.helper.remove_tmp(args_srna.gffs)
        self.helper.remove_tmp(self.gff_output)
        if args_srna.frag_wigs is not None:
            self.helper.remove_tmp(args_srna.frag_wigs)
        if args_srna.tex_wigs is not None:
            self.helper.remove_tmp(args_srna.tex_wigs)
        if (args_srna.frag_wigs is not None) and (args_srna.tex_wigs
                                                  is not None):
            shutil.rmtree(args_srna.merge_wigs)
        self.helper.remove_tmp(args_srna.trans)
        if args_srna.tss_folder is not None:
            self.helper.remove_tmp(args_srna.tss_folder)
        if args_srna.pro_folder is not None:
            self.helper.remove_tmp(args_srna.pro_folder)
        if args_srna.sorf_file is not None:
            self.helper.remove_tmp(args_srna.sorf_file)
        if "tmp_median" in os.listdir(args_srna.out_folder):
            os.remove(os.path.join(args_srna.out_folder, "tmp_median"))
        if self.term_path is not None:
            self.helper.remove_tmp(args_srna.terms)

    def _filter_srna(self, args_srna, prefixs):
        '''set the filter of sRNA'''
        if args_srna.import_info is not None:
            if "sec_str" in args_srna.import_info:
                self._compute_2d_and_energy(args_srna, prefixs)
        if args_srna.nr_database is not None:
            self._blast(args_srna.nr_database, args_srna.nr_format, "prot",
                        args_srna, prefixs, "blastx", "nr", args_srna.e_nr)
        if self.sorf_path is not None:
            for prefix in prefixs:
                if ("_".join([prefix, "sORF.gff"])
                        in os.listdir(self.sorf_path)):
                    tmp_srna = os.path.join(args_srna.out_folder,
                                            "".join(["tmp_srna_sorf", prefix]))
                    tmp_sorf = os.path.join(args_srna.out_folder,
                                            "".join(["tmp_sorf_srna", prefix]))
                    srna_sorf_comparison(
                        "_".join([self.prefixs["basic"], prefix]),
                        os.path.join(self.sorf_path,
                                     "_".join([prefix, "sORF.gff"])), tmp_srna,
                        tmp_sorf)
                    os.remove(tmp_sorf)
                    shutil.move(tmp_srna,
                                "_".join([self.prefixs["basic"], prefix]))
        if args_srna.srna_database is not None:
            self._blast(args_srna.srna_database, args_srna.srna_format, "nucl",
                        args_srna, prefixs, "blastn", "sRNA", args_srna.e_srna)

    def _import_info_format(self, import_info):
        new_info = []
        for info in import_info:
            info = info.lower()
            new_info.append(info)
        return new_info

    def _gen_table(self, prefixs, args_srna):
        for prefix in prefixs:
            out_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            gen_srna_table(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                "_".join([self.prefixs["merge_table"], prefix]),
                "_".join([self.tmps["nr"], prefix + ".csv"]),
                "_".join([self.tmps["srna"], prefix + ".csv"]), args_srna,
                out_table, self.term_path)

    def _print_rank_all(self, prefixs):
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            print_rank_all(all_table, best_table)

    def _filter_min_utr(self, prefixs, min_utr):
        '''filter out the low expressed UTR-derived sRNA'''
        for prefix in prefixs:
            filter_utr(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])),
                os.path.join(self.all_best["all_table"],
                             "_".join([prefix, "sRNA.csv"])), min_utr)

    def _antisense(self, gffs, prefixs):
        '''detection of antisense'''
        for prefix in prefixs:
            all_table = os.path.join(self.all_best["all_table"],
                                     "_".join([prefix, "sRNA.csv"]))
            best_table = os.path.join(self.all_best["best_table"],
                                      "_".join([prefix, "sRNA.csv"]))
            all_gff = os.path.join(self.all_best["all_gff"],
                                   "_".join([prefix, "sRNA.gff"]))
            best_gff = os.path.join(self.all_best["best_gff"],
                                    "_".join([prefix, "sRNA.gff"]))
            srna_antisense(all_gff, all_table,
                           os.path.join(gffs, prefix + ".gff"))
            srna_antisense(best_gff, best_table,
                           os.path.join(gffs, prefix + ".gff"))

    def _blast_stat(self, stat_path, srna_tables):
        '''do statistics for blast result'''
        for srna_table in os.listdir(os.path.join(srna_tables, "best")):
            out_srna_blast = os.path.join(
                stat_path, "stat_" + srna_table.replace(".csv", "_blast.csv"))
        blast_class(os.path.join(srna_tables, "best", srna_table),
                    out_srna_blast)

    def _compare_term_promoter(self, out_table, prefix, args_srna):
        '''compare sRNA with terminator and promoter'''
        if self.term_path is not None:
            compare_srna_term(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])), out_table,
                os.path.join(self.term_path, "_".join([prefix, "term.gff"])),
                args_srna.fuzzy_b, args_srna.fuzzy_a)
        if (args_srna.promoter_table is not None):
            compare_srna_promoter(
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])), out_table,
                args_srna)

    def run_srna_detection(self, args_srna):
        self._check_necessary_file(args_srna)
        self.multiparser.parser_gff(args_srna.trans, "transcript")
        self.multiparser.combine_gff(args_srna.gffs, self.tran_path, None,
                                     "transcript")
        if args_srna.import_info is not None:
            args_srna.import_info = self._import_info_format(
                args_srna.import_info)
        prefixs = self._run_program(args_srna)
        self._filter_srna(args_srna, prefixs)
        for prefix in prefixs:
            shutil.copyfile(
                "_".join([self.prefixs["basic"], prefix]),
                os.path.join(self.all_best["all_gff"],
                             "_".join([prefix, "sRNA.gff"])))
            self._compare_term_promoter(
                "_".join([self.prefixs["merge_table"], prefix]), prefix,
                args_srna)
        self._gen_table(prefixs, args_srna)
        self._class_srna(prefixs, args_srna)
        self._get_best_result(prefixs, args_srna)
        self._print_rank_all(prefixs)
        if args_srna.srna_database is not None:
            if "blast_srna" in args_srna.import_info:
                self._blast_stat(self.stat_path, self.table_output)
        self._remove_file(args_srna)
class TestMultiparser(unittest.TestCase):

    def setUp(self):
        self.multiparser = Multiparser()
        self.example = Example()
        self.ref_folder = "ref_folder"
        if (not os.path.exists(self.ref_folder)):
            os.mkdir(self.ref_folder)
        self.tar_folder = "tar_folder"
        if (not os.path.exists(self.tar_folder)):
            os.mkdir(self.tar_folder)

    def tearDown(self):
        if os.path.exists(self.ref_folder):
            shutil.rmtree(self.ref_folder)
        if os.path.exists(self.tar_folder):
            shutil.rmtree(self.tar_folder)

    def test_combine_fasta(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.gff_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_tar, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_tar, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_gff1 = os.path.join(tmp_ref, "aaa.gff")
        with open(sub_gff1, "w") as rh:
            rh.write(self.example.sub_gff1)
        sub_gff2 = os.path.join(tmp_ref, "bbb.gff")
        with open(sub_gff2, "w") as rh:
            rh.write(self.example.sub_gff2)
        self.multiparser.combine_fasta(self.ref_folder, tmp_tar, None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.fa")))

    def test_combine_wig(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_ref, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_ref, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_wig1 = os.path.join(tmp_tar, "test_forward.wig_STRAIN_aaa.wig")
        sub_wig2 = os.path.join(tmp_tar, "test_forward.wig_STRAIN_bbb.wig")
        sub_wig3 = os.path.join(tmp_tar, "test_reverse.wig_STRAIN_aaa.wig")
        sub_wig4 = os.path.join(tmp_tar, "test_reverse.wig_STRAIN_bbb.wig")
        wig_files = [sub_wig1, sub_wig2, sub_wig3, sub_wig4]
        example_wigs = [self.example.sub_f_wig1, self.example.sub_f_wig2,
                        self.example.sub_r_wig1, self.example.sub_r_wig2]
        for index in range(0, 4):
            with open(wig_files[index], "w") as fh:
                fh.write(example_wigs[index])
        libs = ["test_forward.wig_STRAIN_aaa.wig:frag:1:a:+", "test_reverse.wig_STRAIN_aaa.wig:frag:1:a:-"]
        self.multiparser.combine_wig(self.ref_folder, tmp_tar, "fasta", libs)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test_forward.wig")))
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test_reverse.wig")))

    def test_combine_gff(self):
        tmp_tar = os.path.join(self.tar_folder, "tmp")
        tmp_ref = os.path.join(self.ref_folder, "test.fa_folder")
        os.mkdir(tmp_ref)
        os.mkdir(tmp_tar)
        sub_fasta1 = os.path.join(tmp_ref, "aaa.fa")
        with open(sub_fasta1, "w") as rh:
            rh.write(self.example.sub_fasta1)
        sub_fasta2 = os.path.join(tmp_ref, "bbb.fa")
        with open(sub_fasta2, "w") as rh:
            rh.write(self.example.sub_fasta2)
        sub_gff1 = os.path.join(tmp_tar, "aaa.gff")
        with open(sub_gff1, "w") as rh:
            rh.write(self.example.sub_gff1)
        sub_gff2 = os.path.join(tmp_tar, "bbb.gff")
        with open(sub_gff2, "w") as rh:
            rh.write(self.example.sub_gff2)
        self.multiparser.combine_gff(self.ref_folder, tmp_tar, "fasta", None)
        self.assertTrue(os.path.exists(os.path.join(tmp_tar, "test.gff")))

    def test_parser_fasta(self):
        fasta_file = os.path.join(self.ref_folder, "test.fa")
        with open(fasta_file, "w") as rh:
            rh.write(self.example.fasta_file)
        self.multiparser.parser_fasta(self.ref_folder)
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/aaa.fa")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/bbb.fa")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "test.fa_folder/aaa.fa")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "test.fa_folder/bbb.fa")))

    def test_parser_gff(self):
        gff_file = os.path.join(self.ref_folder, "test.gff")
        with open(gff_file, "w") as rh:
            rh.write(self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, None)
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/aaa.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/bbb.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "test.gff_folder/aaa.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "test.gff_folder/bbb.gff")))
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        os.rename(gff_file, tss_file)
        tss_file = os.path.join(self.ref_folder, "test_TSS.gff")
        with open(tss_file, "w") as rh:
            rh.write(self.example.gff_file)
        self.multiparser.parser_gff(self.ref_folder, "TSS")
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/aaa_TSS.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/bbb_TSS.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "test_TSS.gff_folder/aaa_TSS.gff")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "test_TSS.gff_folder/bbb_TSS.gff")))

    def test_parser_wig(self):
        wig_f_file = os.path.join(self.ref_folder, "test_forward.wig")
        with open(wig_f_file, "w") as rh:
            rh.write(self.example.wig_f_file)
        wig_r_file = os.path.join(self.ref_folder, "test_reverse.wig")
        with open(wig_r_file, "w") as rh:
            rh.write(self.example.wig_r_file)
        self.multiparser.parser_wig(self.ref_folder)
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/test_forward_STRAIN_aaa.wig")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/test_forward_STRAIN_bbb.wig")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/test_reverse_STRAIN_aaa.wig")))
        self.assertTrue(os.path.exists(os.path.join(self.ref_folder, "tmp/test_reverse_STRAIN_bbb.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(self.ref_folder, "test_forward.wig_folder/test_forward_STRAIN_aaa.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(self.ref_folder, "test_forward.wig_folder/test_forward_STRAIN_bbb.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(self.ref_folder, "test_reverse.wig_folder/test_reverse_STRAIN_aaa.wig")))
        self.assertTrue(os.path.exists(
            os.path.join(self.ref_folder, "test_reverse.wig_folder/test_reverse_STRAIN_bbb.wig")))
示例#8
0
class TSSpredator(object):

    def __init__(self, args_tss):
        self.multiparser = Multiparser()
        self.helper = Helper()
        self.converter = Converter()
        self.master = os.path.join(args_tss.out_folder, "MasterTables")
        self.tmps = {"tss": "tmp_TSS", "ta_tss": "tmp_ta_tss", "tss_ta":
                     "tmp_tss", "tmp": "tmp"}
        if args_tss.ta_files is not None:
            self.tmps["ta"] = os.path.join(args_tss.ta_files, "tmp")
        else:
            self.tmps["ta"] = None
        self.gff_path = os.path.join(args_tss.gffs, "tmp")
        if args_tss.manual is not None:
            self.manual_path = os.path.join(args_tss.manual, "tmp")
        self.wig_path = os.path.join(args_tss.wig_folder, "tmp")
        self.fasta_path = os.path.join(args_tss.fastas, "tmp")
        self.stat_outfolder = os.path.join(args_tss.out_folder, "statistics")
        self.gff_outfolder = os.path.join(args_tss.out_folder, "gffs")

    def _assign_dict(self, lib_datas):
        return {"wig": lib_datas[0],
                "tex": lib_datas[1],
                "condition": int(lib_datas[2]),
                "replicate": lib_datas[3],
                "strand": lib_datas[4]}

    def _print_lib(self, lib_num, lib_list, out, wig_folder, prefix, rep_set):
        for num_id in range(1, lib_num+1):
            cond_list = []
            for lib in lib_list:
                if num_id == lib["condition"]:
                    cond_list.append(lib)
            cond_sort_list = sorted(cond_list, key=lambda k: k['replicate'])
            reps = []
            for cond in cond_sort_list:
                out.write("{0}_{1}{2} = {3}\n".format(
                          prefix, cond["condition"], cond["replicate"],
                          os.path.join(wig_folder, cond["wig"])))
                reps.append(cond["replicate"])
            for rep in sorted(rep_set):
                if rep not in reps:
                    out.write("{0}_{1}{2} = \n".format(
                              prefix, cond["condition"], rep))

    def _start_to_run(self, tsspredator_path, config_file, out_path, prefix, log):
        print("Running TSSpredator for " + prefix)
        log.write("Make sure the version of TSSpredator is at least 1.06.\n")
        out = open(os.path.join(out_path, "log.txt"), "w")
        err = open(os.path.join(out_path, "err.txt"), "w")
        log.write(" ".join(["java", "-jar", tsspredator_path,
                            config_file]) + "\n")
        call(["java", "-jar", tsspredator_path,
              config_file], stdout=out, stderr=err)
        out.close()
        err.close()
        log.write("Done!\n")
        log.write("The following files are generated in {0}:\n".format(out_path))
        for file_ in os.listdir(out_path):
            log.write("\t" + file_ + "\n")

    def _import_lib(self, libs, wig_folder, project_strain_name,
                    out, gff, program, fasta):
        lib_dict = {"fp": [], "fm": [], "nm": [], "np": []}
        lib_num = 0
        rep_set = set()
        list_num_id = []
        for lib in libs:
            lib_datas = lib.split(":")
            if not lib_datas[0].endswith(".wig"):
                print("Error: Wiggle files are not end with .wig!")
                sys.exit()
            for wig in os.listdir(wig_folder):
                filename = wig.split("_STRAIN_")
                if (filename[0] == lib_datas[0][:-4]) and (
                        filename[1][:-4] == project_strain_name):
                    lib_datas[0] = wig
            if int(lib_datas[2]) > lib_num:
                lib_num = int(lib_datas[2])
            if lib_datas[3] not in rep_set:
                rep_set.add(lib_datas[3])
            if (lib_datas[1] == "tex") and (lib_datas[4] == "+"):
                lib_dict["fp"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "tex") and (lib_datas[4] == "-"):
                lib_dict["fm"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "+"):
                lib_dict["np"].append(self._assign_dict(lib_datas))
            elif (lib_datas[1] == "notex") and (lib_datas[4] == "-"):
                lib_dict["nm"].append(self._assign_dict(lib_datas))
        for num_id in range(1, lib_num+1):
            out.write("annotation_{0} = {1}\n".format(num_id, gff))
        if program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        elif program.lower() == "ps":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "fivePrimeMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "fivePrimePlus", rep_set)
        else:
            print("Error: Wrong program name! Please assing tss "
                  "or processing_site.")
            sys.exit()
        for num_id in range(1, lib_num+1):
            out.write("genome_{0} = {1}\n".format(num_id, fasta))
        for num_id in range(1, lib_num+1):
            list_num_id.append(str(num_id))
        return lib_num, num_id, rep_set, lib_dict, list_num_id

    def _print_repmatch(self, args_tss, out):
        '''check replicate match'''
        detect_all = False
        for rep in args_tss.repmatch:
            if "all" in rep:
                detect_all = True
                match = rep.split("_")[-1]
                out.write("minNumRepMatches = {0}\n".format(match))
                break
        if not detect_all:
            nums = {}
            matchs = {}
            for match in args_tss.repmatch:
                lib = match.split("_")[0]
                rep = match.split("_")[-1]
                matchs[lib] = rep
                if rep not in nums.keys():
                    nums[rep] = 1
                else:
                    nums[rep] += 1
            for rep, num in nums.items():
                if num == max(nums.values()):
                    out.write("minNumRepMatches = {0}\n".format(rep))
                    max_rep = rep
                    break
            for lib, rep in matchs.items():
                if rep != max_rep:
                    out.write("minNumRepMatches_{0} = {1}\n".format(
                        lib, rep))

    def _gen_config(self, project_strain_name, args_tss, gff,
                    wig_folder, fasta, config_file, log):
        '''generation of config files'''
        master_folder = "MasterTable_" + project_strain_name
        out_path = os.path.join(self.master, master_folder)
        self.helper.check_make_folder(out_path)
        out = open(config_file, "w")
        out.write("TSSinClusterSelectionMethod = HIGHEST\n")
        out.write("allowedCompareShift = 1\n")
        out.write("allowedRepCompareShift = 1\n")
        lib_num, num_id, rep_set, lib_dict, list_num_id = \
            self._import_lib(args_tss.libs, wig_folder, project_strain_name,
                             out, gff, args_tss.program, fasta)
        out.write("idList = ")
        out.write(",".join(list_num_id) + "\n")
        out.write("maxASutrLength = 100\n")
        out.write("maxGapLengthInGene = 500\n")
        out.write("maxNormalTo5primeFactor = {0}\n".format(
                  args_tss.processing_factor))
        out.write("maxTSSinClusterDistance = {0}\n".format(
                  args_tss.cluster + 1))
        out.write("maxUTRlength = {0}\n".format(args_tss.utr_length))
        out.write("min5primeToNormalFactor = {0}\n".format(
                  args_tss.enrichment_factor))
        out.write("minCliffFactor = {0}\n".format(args_tss.factor))
        out.write("minCliffFactorDiscount = {0}\n".format(
                  args_tss.factor_reduction))
        out.write("minCliffHeight = {0}\n".format(args_tss.height))
        out.write("minCliffHeightDiscount = {0}\n".format(
                  args_tss.height_reduction))
        out.write("minNormalHeight = {0}\n".format(args_tss.base_height))
        self._print_repmatch(args_tss, out)
        out.write("minPlateauLength = 0\n")
        out.write("mode = cond\n")
        out.write("normPercentile = 0.9\n")
        if args_tss.program.lower() == "tss":
            self._print_lib(lib_num, lib_dict["nm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["np"], out,
                            wig_folder, "normalPlus", rep_set)
        else:
            self._print_lib(lib_num, lib_dict["fm"], out,
                            wig_folder, "normalMinus", rep_set)
            self._print_lib(lib_num, lib_dict["fp"], out,
                            wig_folder, "normalPlus", rep_set)
        out.write("numReplicates = {0}\n".format(len(rep_set)))
        out.write("numberOfDatasets = {0}\n".format(lib_num))
        out.write("outputDirectory = {0}\n".format(out_path))
        for prefix_id in range(len(args_tss.output_prefixs)):
            out.write("outputPrefix_{0} = {1}\n".format(
                      prefix_id + 1, args_tss.output_prefixs[prefix_id]))
        out.write("projectName = {0}\n".format(project_strain_name))
        out.write("superGraphCompatibility = igb\n")
        out.write("texNormPercentile = 0.5\n")
        out.write("writeGraphs = 0\n")
        out.write("writeNocornacFiles = 0\n")
        log.write("\t" + config_file + " is generated.\n")
        out.close()

    def _convert_gff(self, prefixs, args_tss, log):
        for prefix in prefixs:
            out_file = os.path.join(self.gff_outfolder, "_".join([
                           prefix, args_tss.program]) + ".gff")
            gff_f = open(out_file, "w")
            out_path = os.path.join(self.master, "_".join([
                           "MasterTable", prefix]))
            if "MasterTable.tsv" not in os.listdir(out_path):
                print("Error: There is not MasterTable file in {0} ".format(
                      out_path))
                print("Please check configuration file.")
                log.write("not MasterTable file is found in {0}\n".format(
                           out_path))
            else:
                if args_tss.program.lower() == "processing":
                    feature = "processing_site"
                elif args_tss.program.lower() == "tss":
                    feature = "TSS"
                self.converter.convert_mastertable2gff(
                    os.path.join(out_path, "MasterTable.tsv"),
                    "ANNOgesic", feature, prefix, out_file)
                log.write("\t" + out_file + "is generated.\n")
            gff_f.close()

    def _merge_manual(self, tsss, args_tss):
        '''if manual detected TSS is provided, it can merge manual detected TSS 
        and TSSpredator predicted TSS'''
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tss"]))
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            filename = "_".join([tss, args_tss.program]) + ".gff"
            predict = os.path.join(self.gff_outfolder, filename)
            manual = os.path.join(self.manual_path, tss + ".gff")
            fasta = os.path.join(self.fasta_path, tss + ".fa")
            stat_file = "stat_compare_TSSpredator_manual_{0}.csv".format(tss)
            if os.path.exists(manual):
                print("Merging and classiflying manually-detected "
                      "TSSs for {0}".format(tss))
                merge_manual_predict_tss(
                    predict, stat_file,
                    os.path.join(self.tmps["tss"], filename),
                    os.path.join(args_tss.gffs, gff), args_tss, manual, fasta)
            if os.path.exists(stat_file):
                shutil.move(stat_file, os.path.join(
                    args_tss.out_folder, "statistics", tss, stat_file))
        self.helper.move_all_content(self.tmps["tss"],
                                     self.gff_outfolder, [".gff"])
        shutil.rmtree(self.tmps["tss"])

    def _validate(self, tsss, args_tss, log):
        '''validate TSS with genome annotation'''
        print("Validating TSSs with genome annotations")
        log.write("Running validate_gene.py to compare genome "
                  "annotations and TSSs/PSs.\n")
        for tss in tsss:
            for gff in os.listdir(args_tss.gffs):
                if (gff[:-4] == tss) and (".gff" in gff):
                    break
            stat_file = os.path.join(
                    self.stat_outfolder, tss,
                    "".join(["stat_gene_vali_", tss, ".csv"]))
            out_cds_file = os.path.join(args_tss.out_folder, "tmp.gff")
            if args_tss.program.lower() == "tss":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "TSS.gff"]))
            elif args_tss.program.lower() == "processing":
                compare_file = os.path.join(self.gff_outfolder,
                                            "_".join([tss, "processing.gff"]))
            validate_gff(compare_file, os.path.join(args_tss.gffs, gff),
                         stat_file, out_cds_file, args_tss.utr_length,
                         args_tss.program.lower())
            log.write("\t" + stat_file + " is generated.\n")
            shutil.move(out_cds_file, os.path.join(args_tss.gffs, gff))

    def _compare_ta(self, tsss, args_tss, log):
        '''compare TSS with transcript'''
        detect = False
        log.write("Running stat_TA_comparison to compare transcripts "
                  "and TSSs/PSs.\n")
        print("Comparing transcripts and TSSs")
        self.multiparser.parser_gff(args_tss.ta_files, "transcript")
        self.multiparser.combine_gff(args_tss.gffs, self.tmps["ta"],
                                     None, "transcript")
        for tss in tsss:
            stat_out = os.path.join(
                    self.stat_outfolder, tss, "".join([
                        "stat_compare_TSS_transcript_",
                        tss, ".csv"]))
            for ta in os.listdir(self.tmps["ta"]):
                filename = ta.split("_transcript")
                if (filename[0] == tss) and (filename[1] == ".gff"):
                    detect = True
                    break
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, "TSS.gff"]))
            if detect:
                stat_ta_tss(os.path.join(self.tmps["ta"], ta), compare_file,
                            stat_out, self.tmps["ta_tss"],
                            self.tmps["tss_ta"], args_tss.fuzzy)
                self.helper.sort_gff(self.tmps["tss_ta"], compare_file)
                self.helper.sort_gff(self.tmps["ta_tss"],
                                     os.path.join(args_tss.ta_files, ta))
                os.remove(self.tmps["tss_ta"])
                os.remove(self.tmps["ta_tss"])
                detect = False
            log.write("\t" + stat_out + " is generated.\n")

    def _stat_tss(self, tsss, feature, log):
        print("Running statistaics")
        for tss in tsss:
            compare_file = os.path.join(self.gff_outfolder,
                                        "_".join([tss, feature]) + ".gff")
            stat_tsspredator(
                compare_file, feature,
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "class", tss]) + ".csv"),
                os.path.join(self.stat_outfolder, tss, "_".join([
                    "stat", feature, "libs", tss]) + ".csv"))
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_class", ".png"])
            if os.path.exists(os.path.join(
                    self.stat_outfolder, "TSSstatistics.tsv")):
                shutil.move(
                    os.path.join(
                        self.stat_outfolder, "TSSstatistics.tsv"),
                    os.path.join(
                        self.stat_outfolder, tss, "TSSstatistics.tsv"))
            plot_venn(compare_file, feature)
            self.helper.move_all_content(os.getcwd(), os.path.join(
                self.stat_outfolder, tss), ["_venn", ".png"])
            log.write("The following files in {0} are generated:\n".format(
                (os.path.join(self.stat_outfolder, tss))))
            for file_ in os.listdir(os.path.join(
                    self.stat_outfolder, tss)):
                log.write("\t" + file_ + "\n")

    def _set_gen_config(self, args_tss, input_folder, log):
        prefixs = []
        detect = False
        log.write("Generating config files for TSSpredator.\n")
        for fasta in os.listdir(self.fasta_path):
            run = False
            for gff in os.listdir(self.gff_path):
                if fasta[:-3] == gff[:-4]:
                    prefix = fasta[:-3]
                    for wig in os.listdir(self.wig_path):
                        filename = wig.split("_STRAIN_")
                        if filename[1][:-4] == prefix:
                            detect = True
                            break
                    if detect:
                        prefixs.append(prefix)
                        config = os.path.join(
                                input_folder,
                                "_".join(["config", prefix]) + ".ini")
                        self._gen_config(
                            prefix, args_tss,
                            os.path.join(self.gff_path, gff), self.wig_path,
                            os.path.join(self.fasta_path, fasta), config, log)
        return prefixs

    def _merge_wigs(self, wig_folder, prefix, libs):
        self.helper.check_make_folder(os.path.join(os.getcwd(),
                                      self.tmps["tmp"]))
        for wig_file in os.listdir(wig_folder):
            for lib in libs:
                info = lib.split(":")
                if (info[0][:-4] in wig_file) and (info[-1] == "+") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_forward.wig"))
                if (info[0][:-4] in wig_file) and (info[-1] == "-") and (
                        prefix in wig_file) and (
                        os.path.isfile(os.path.join(wig_folder, wig_file))):
                    Helper().merge_file(
                            os.path.join(wig_folder, wig_file),
                            os.path.join("tmp", "merge_reverse.wig"))

    def _check_orphan(self, prefixs, wig_folder, args_tss):
        '''if genome has no locus tag, it can use for classify the TSS'''
        for prefix in prefixs:
            self._merge_wigs(wig_folder, prefix, args_tss.libs)
            tmp_tss = os.path.join(self.tmps["tmp"], "_".join([
                          prefix, args_tss.program + ".gff"]))
            pre_tss = os.path.join(self.gff_outfolder, "_".join([
                          prefix, args_tss.program + ".gff"]))
            check_orphan(pre_tss, os.path.join(
                args_tss.gffs, prefix + ".gff"),
                "tmp/merge_forward.wig", "tmp/merge_reverse.wig", tmp_tss)
            shutil.move(tmp_tss, pre_tss)
        shutil.rmtree("tmp")

    def _remove_files(self, args_tss):
        print("Remove temperary files and folders")
        self.helper.remove_tmp_dir(args_tss.fastas)
        self.helper.remove_tmp_dir(args_tss.gffs)
        self.helper.remove_tmp_dir(args_tss.ta_files)
        if "merge_forward.wig" in os.listdir(os.getcwd()):
            os.remove("merge_forward.wig")
        if "merge_reverse.wig" in os.listdir(os.getcwd()):
            os.remove("merge_reverse.wig")
        shutil.rmtree(args_tss.wig_folder)
        if args_tss.manual is not None:
            shutil.rmtree(args_tss.manual)

    def _deal_with_overlap(self, out_folder, args_tss):
        '''deal with the situation that TSS and 
        processing site at the same position'''
        if not args_tss.overlap_feature:
            pass
        else:
            print("Comparing TSSs and Processing sites")
            if args_tss.program.lower() == "tss":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_TSS.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_processing.gff",
                                tss.replace("_TSS.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)
            elif args_tss.program.lower() == "processing":
                for tss in os.listdir(out_folder):
                    if tss.endswith("_processing.gff"):
                        ref = self.helper.get_correct_file(
                                args_tss.overlap_gffs, "_TSS.gff",
                                tss.replace("_processing.gff", ""), None, None)
                        filter_tss_pro(os.path.join(out_folder, tss),
                                       ref, args_tss.program,
                                       args_tss.cluster)

    def _low_expression(self, args_tss, gff_folder):
        '''deal with the low expressed TSS'''
        prefix = None
        self._merge_wigs(args_tss.wig_folder, "wig", args_tss.libs)
        for gff in os.listdir(gff_folder):
            if (args_tss.program.lower() == "tss") and (
                    gff.endswith("_TSS.gff")):
                prefix = gff.replace("_TSS.gff", "")
            elif (args_tss.program.lower() == "processing") and (
                    gff.endswith("_processing.gff")):
                prefix = gff.replace("_processing.gff", "")
            if prefix:
                out = open(os.path.join(
                    self.stat_outfolder, prefix, "_".join([
                        "stat", prefix, "low_expression_cutoff.csv"])), "w")
                out.write("\t".join(["Genome", "Cutoff_coverage"]) + "\n")
                cutoff = filter_low_expression(
                        os.path.join(gff_folder, gff), args_tss,
                        "tmp/merge_forward.wig", "tmp/merge_reverse.wig",
                        "tmp/without_low_expression.gff")
                out.write("\t".join([prefix, str(cutoff)]) + "\n")
                os.remove(os.path.join(gff_folder, gff))
                shutil.move("tmp/without_low_expression.gff",
                            os.path.join(gff_folder, gff))
                prefix = None
        out.close()

    def run_tsspredator(self, args_tss, log):
        input_folder = os.path.join(args_tss.out_folder, "configs")
        for gff in os.listdir(args_tss.gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(
                                                 args_tss.gffs, gff))
        self.helper.check_make_folder(self.gff_outfolder)
        self.multiparser.parser_fasta(args_tss.fastas)
        self.multiparser.parser_gff(args_tss.gffs, None)
        self.multiparser.parser_wig(args_tss.wig_folder)
        prefixs = self._set_gen_config(args_tss, input_folder, log)
        for prefix in prefixs:
            out_path = os.path.join(
                    self.master, "_".join(["MasterTable", prefix]))
            config_file = os.path.join(
                    input_folder, "_".join(["config", prefix]) + ".ini")
            self._start_to_run(args_tss.tsspredator_path, config_file,
                               out_path, prefix, log)
            if os.path.exists(os.path.join(out_path, "TSSstatistics.tsv")):
                shutil.move(os.path.join(out_path, "TSSstatistics.tsv"),
                            os.path.join(
                                self.stat_outfolder, "TSSstatistics.tsv"))
        if args_tss.program.lower() == "ps":
            args_tss.program = "processing"
        self._convert_gff(prefixs, args_tss, log)
        if args_tss.check_orphan:
            print("checking the orphan TSSs")
            log.write("Running check_orphan.py to re-check orphan TSSs.\n")
            self._check_orphan(prefixs,
                               os.path.join(args_tss.wig_folder, "tmp"),
                               args_tss)
        self.multiparser.combine_gff(args_tss.gffs, self.gff_outfolder,
                                     None, args_tss.program)
        datas = []
        for gff in os.listdir(self.gff_outfolder):
            if gff.endswith(".gff"):
                gff_folder = gff.replace("".join(["_", args_tss.program,
                                                  ".gff"]), "")
                self.helper.check_make_folder(
                     os.path.join(self.stat_outfolder, gff_folder))
                datas.append(gff_folder)
        if args_tss.remove_low_expression is not None:
            log.write("Running filter_low_expression.py to filter out "
                      "low expressed TSS/PS.\n")
            self._low_expression(args_tss, self.gff_outfolder)
        if args_tss.manual is not None:
            self.multiparser.parser_gff(args_tss.manual, None)
            self.multiparser.combine_gff(args_tss.gffs, self.manual_path,
                                         None, None)
            self.multiparser.combine_fasta(args_tss.gffs, self.fasta_path,
                                         None)
            self.multiparser.combine_wig(args_tss.gffs, self.wig_path,
                                         None, args_tss.libs)
            log.write("Running merge_manual.py to merge the manual TSSs.\n")
            self._merge_manual(datas, args_tss)
        log.write("Running filter_TSS_pro.py to deal with the overlap "
                  "position between TSS and PS.\n")
        self._deal_with_overlap(self.gff_outfolder, args_tss)
        log.write("Running stat_TSSpredator.py to do statistics.\n")
        self._stat_tss(datas, args_tss.program, log)
        if args_tss.validate:
            self._validate(datas, args_tss, log)
        if args_tss.ta_files is not None:
            self._compare_ta(datas, args_tss, log)
        self._remove_files(args_tss)
示例#9
0
class sORFDetection(object):
    '''detection of sORF'''
    def __init__(self, args_sorf):
        self.multiparser = Multiparser()
        self.helper = Helper()
        if args_sorf.tsss is not None:
            self.tss_path = os.path.join(args_sorf.tsss, "tmp")
        else:
            self.tss_path = None
        if args_sorf.srnas is not None:
            self.srna_path = os.path.join(args_sorf.srnas, "tmp")
        else:
            self.srna_path = None
        self.gff_output = os.path.join(args_sorf.out_folder, "gffs")
        self.table_output = os.path.join(args_sorf.out_folder, "tables")
        self.tran_path = os.path.join(args_sorf.trans, "tmp")
        self.fasta_path = os.path.join(args_sorf.fastas, "tmp")
        self.all_cand = "all_candidates"
        self.best = "best_candidates"

    def _check_gff(self, gffs):
        for gff in os.listdir(gffs):
            if gff.endswith(".gff"):
                self.helper.check_uni_attributes(os.path.join(gffs, gff))

    def _check_necessary_files(self, args_sorf, log):
        if (args_sorf.gffs is None) or (args_sorf.trans is None) or (
            (args_sorf.tex_wigs is None) and (args_sorf.frag_wigs is None)):
            print("Error: lack required files!")
            log.write("genome annotation, transcript file or wiggle files "
                      "are not assigned.\n")
            sys.exit()
        if args_sorf.utr_detect:
            if (args_sorf.tsss is None):
                print("Error: TSS files are required for UTR derived"
                      " sORF detection!")
                log.write("TSS files are required for UTR derived"
                          " sORF detection!\n")
                sys.exit()
        self._check_gff(args_sorf.gffs)
        self.multiparser.parser_gff(args_sorf.gffs, None)
        if args_sorf.tsss is not None:
            self._check_gff(args_sorf.tsss)
            self.multiparser.parser_gff(args_sorf.tsss, "TSS")
            self.multiparser.combine_gff(args_sorf.gffs, self.tss_path, None,
                                         "TSS")
        self._check_gff(args_sorf.trans)
        if args_sorf.srnas is not None:
            self._check_gff(args_sorf.srnas)
            self.multiparser.parser_gff(args_sorf.srnas, "sRNA")
            self.multiparser.combine_gff(args_sorf.gffs, self.srna_path, None,
                                         "sRNA")

    def _start_stop_codon(self, prefixs, args_sorf, log):
        '''detect the sORF based on start and stop codon 
        and ribosome binding site'''
        log.write("Running sORF_detection.py for detecting sORFs.\n")
        log.write("The following files are generated:\n")
        for prefix in prefixs:
            print("Searching sORFs of {0}".format(prefix))
            if self.srna_path is not None:
                srna_file = os.path.join(self.srna_path,
                                         "_".join([prefix, "sRNA.gff"]))
            else:
                srna_file = None
            if self.tss_path is not None:
                tss_file = os.path.join(self.tss_path,
                                        "_".join([prefix, "TSS.gff"]))
            else:
                tss_file = None
            sorf_detection(
                os.path.join(self.fasta_path, prefix + ".fa"), srna_file,
                os.path.join(args_sorf.out_folder,
                             "_".join([prefix, "inter.gff"])), tss_file,
                os.path.join(args_sorf.wig_path,
                             "_".join([prefix, "forward.wig"])),
                os.path.join(args_sorf.wig_path,
                             "_".join([prefix, "reverse.wig"])),
                os.path.join(self.gff_output, self.all_cand,
                             "_".join([prefix, "sORF"])), args_sorf)
            if "_".join([prefix, "sORF_all.gff"]) in os.listdir(
                    os.path.join(self.gff_output, self.all_cand)):
                gff_all = os.path.join(self.gff_output, self.all_cand,
                                       "_".join([prefix, "sORF.gff"]))
                gff_best = os.path.join(self.gff_output, self.best,
                                        "_".join([prefix, "sORF.gff"]))
                csv_all = os.path.join(self.table_output, self.all_cand,
                                       "_".join([prefix, "sORF.csv"]))
                csv_best = os.path.join(self.table_output, self.best,
                                        "_".join([prefix, "sORF.csv"]))
                shutil.move(
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF_all.gff"])), gff_all)
                shutil.move(
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF_best.gff"])),
                    gff_best)
                shutil.move(
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF_all.csv"])), csv_all)
                shutil.move(
                    os.path.join(self.gff_output, self.all_cand,
                                 "_".join([prefix, "sORF_best.csv"])),
                    csv_best)
                log.write("\t" + gff_all + "\n")
                log.write("\t" + gff_best + "\n")
                log.write("\t" + csv_all + "\n")
                log.write("\t" + csv_best + "\n")

    def _remove_tmp(self, args_sorf):
        self.helper.remove_all_content(args_sorf.out_folder, ".gff", "file")
        self.helper.remove_tmp_dir(args_sorf.fastas)
        self.helper.remove_tmp_dir(args_sorf.gffs)
        self.helper.remove_tmp_dir(args_sorf.tsss)
        self.helper.remove_tmp_dir(args_sorf.trans)
        self.helper.remove_tmp_dir(args_sorf.srnas)
        if "temp_wig" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "temp_wig"))
        if "merge_wigs" in os.listdir(args_sorf.out_folder):
            shutil.rmtree(os.path.join(args_sorf.out_folder, "merge_wigs"))

    def _compare_tran_cds(self, args_sorf, log):
        '''compare transcript and CDS to find the intergenic region'''
        prefixs = []
        log.write("Running sORF_intergenic.py to extract the sequences of "
                  "potential sORFs\n")
        for gff in os.listdir(args_sorf.gffs):
            if gff.endswith(".gff"):
                prefix = gff.replace(".gff", "")
                prefixs.append(prefix)
                print("Comparing transcripts and CDSs of {0}".format(prefix))
                get_intergenic(
                    os.path.join(args_sorf.gffs, gff),
                    os.path.join(self.tran_path,
                                 "_".join([prefix, "transcript.gff"])),
                    os.path.join(args_sorf.out_folder,
                                 "_".join([prefix, "inter.gff"])),
                    args_sorf.utr_detect, args_sorf.hypo, args_sorf.extend_5,
                    args_sorf.extend_3)
                log.write("\t" + os.path.join(args_sorf.out_folder, "_".join(
                    [prefix, "inter.gff"])) +
                          " is generated to temporary store the sequences.\n")
        return prefixs

    def _re_table(self, args_sorf, prefixs, log):
        log.write("Running re_table.py for generating coverage information.\n")
        log.write("The following files are updated:\n")
        for type_ in ["all_candidates", "best_candidates"]:
            for prefix in prefixs:
                table_file = os.path.join(args_sorf.out_folder, "tables",
                                          type_, "_".join([prefix,
                                                           "sORF.csv"]))
                reorganize_table(args_sorf.libs, args_sorf.merge_wigs,
                                 "Track_detail", table_file)
                log.write("\t" + table_file + "\n")

    def run_sorf_detection(self, args_sorf, log):
        if args_sorf.fuzzy_rbs > 6:
            log.write("--fuzzy_rbs should be equal or less than 6!\n")
            print("Error: --fuzzy_rbs should be equal or less than 6!")
            sys.exit()
        self._check_necessary_files(args_sorf, log)
        self.multiparser.parser_gff(args_sorf.trans, "transcript")
        self.multiparser.combine_gff(args_sorf.gffs, self.tran_path, None,
                                     "transcript")
        self.multiparser.parser_fasta(args_sorf.fastas)
        self.multiparser.combine_fasta(args_sorf.gffs, self.fasta_path, None)
        prefixs = self._compare_tran_cds(args_sorf, log)
        self._start_stop_codon(prefixs, args_sorf, log)
        log.write("Running stat_sorf.py to do statistics.\n")
        for sorf in os.listdir(os.path.join(self.gff_output, self.all_cand)):
            print("Running statistics of {0}".format(sorf))
            if sorf.endswith("_sORF.gff"):
                stat_file = os.path.join(
                    args_sorf.out_folder, "statistics",
                    "_".join(["stat", sorf.replace(".gff", ".csv")]))
                stat(os.path.join(self.gff_output, self.all_cand, sorf),
                     os.path.join(self.gff_output, self.best, sorf), stat_file,
                     args_sorf.utr_detect)
                log.write("\t" + stat_file + " is generated.\n")
        self._re_table(args_sorf, prefixs, log)
        self._remove_tmp(args_sorf)