Example #1
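The remapping driver of iqKM's Remapping class: it checks that samtools and bwa are on the PATH, uses one thread fewer than requested when more than one is given, runs indexing, read mapping and BAM processing, and calls clean() only if the resulting *_unique_depth.tab file is non-empty, otherwise logging an error.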
    def remapping(self):
        if file.which("samtools") is None:
            raise EnvironmentError(
                "Could not find executable: {}".format("samtools"))
        elif file.which("bwa") is None:
            raise EnvironmentError(
                "Could not find executable: {}".format("bwa"))
        else:
            file.isdir(self._outdir)
            if self._cpu == 1:
                cpu = 1
            elif self._cpu > 1:
                cpu = self._cpu - 1
            else:
                logging.info("-n CPU should be an integer >= 1 (default = 1)")
                cpu = 1  # fall back to a single thread so the later steps can still run
            self.index()
            self.remap(cpu)
            self.process_bam(cpu)
            if os.stat(
                    os.path.join(self._outdir, self._prefix +
                                 "_unique_depth.tab")).st_size > 0:
                self.clean()
                logging.debug("Remapping and parsing done")
            else:
                logging.error("Remapping and parsing failed")
Example #2
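The command-line entry point of iqKM: it builds the argparse interface below, prints the help text when called without arguments, and otherwise configures logging, validates the input paths, and dispatches to Workflow_iqkm (KM assignment plus quantification, when '--quantify' is given) or Workflow_identify (KM assignment only).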
def main():
    parser = argparse.ArgumentParser(
        usage=
        "iqkm -i metagenome -o out_dir --help_dir help_dir --fq fastq1 --rq fastq2 --meta --quantify",
        description=
        "Workflow for KM assignment and/or quantification, on both contig and sample basis",
        add_help=False,
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")
    required.add_argument(
        "-i",
        "--input",
        dest="genome",
        help="Input genome/metagenomes, required",
        required=True,
    )
    required.add_argument(
        "-o",
        "--out_dir",
        dest="outdir",
        help="Output folder",
        required=True,
    )
    required.add_argument(
        "--help_dir",
        dest="help_dir",
        help=
        "Folder containing Kofam HMM database and help files, refer to README.md for downloading",
        required=True,
    )
    optional.add_argument(
        "--fq",
        dest="fastq1",
        help=
        "Input first or only read file (fastq or fastq.gz), required when '--quantify' is specified",
        required=False,
    )
    optional.add_argument("-h", "--help", action="help")
    optional.add_argument(
        "--rq",
        dest="fastq2",
        help="Input reverse read (fastq or fastq.gz format), optional",
        default=None,
    )
    optional.add_argument(
        "--prefix",
        dest="prefix",
        help=
        "Prefix of output files, default: your input genome/metagenome file name without postfix",
        default=None,
    )
    optional.add_argument(
        "--db",
        dest="hmmdb",
        help=
        "Kofam HMM database for KO assignment, default path='/help_dir/db/kofam.hmm', you can change it to your customised db",
        default=None,
    )
    optional.add_argument(
        "--com",
        dest="com",
        help=
        "KM completeness threshold on contig basis (only KM with completeness above the threshold will be considered present), default = 66.67",
        default=66.67,
        type=float,
    )
    optional.add_argument(
        "--skip",
        action="store_true",
        help=
        "Force skip steps if relevant output files have been found under designated directories, not recommanded if your input file is newer (default = False)",
        default=False,
    )
    optional.add_argument(
        "-q",
        "--quantify",
        action="store_true",
        help=
        "Run both KM assignment and quantification (default = False, add '-q' or '--quantify' to enable)",
        default=False,
    )
    optional.add_argument(
        "-m",
        "--meta",
        action="store_true",
        help="Running in metagenome mode (prodigal -p meta; default = False)",
        default=False,
    )
    optional.add_argument(
        "-w",
        "--include_weights",
        dest="include_weights",
        help=
        "Include weights of each KO when doing KM assignment (default = True)",
        default=True,
    )
    optional.add_argument(
        "-n",
        "--threads",
        dest="cpu",
        help="Number of threads used for computation (default = 1)",
        default=1,
        type=int,
    )
    optional.add_argument(
        "-f",
        "--force",
        action="store_true",
        help=
        "Force reruning the whole pipeline, don't resume previous run (default = False)",
        default=False,
    )
    optional.add_argument(
        "-d",
        "--dist",
        action="store_true",
        help="Apply KM minimum distance threshold (default = True)",
        default=True,
    )
    optional.add_argument(
        "-g",
        "--genome_equivalent",
        dest="GE",
        help=
        "Genome equivalent output generated from microbe-census, can be used for library-size normalization when doing quantification. Optional (default: None)",
        default=None,
    )

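    # With no CLI arguments, just print the help text; otherwise configure
    # logging, validate the inputs, and run the selected workflow.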
    if len(sys.argv) == 1:
        parser.print_help()
    else:
        logging.basicConfig(
            format="%(asctime)s %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S: ",
            level=logging.INFO,
        )
        args = parser.parse_args()
        logging.info("iqKM version {}".format(iqkm.version.__version__))
        if not file.exists(args.genome):
            logging.error(
                "Please provide the right path of input genome/metagenome file (fasta format)"
            )
        if not file.isdir(args.help_dir):
            logging.error(
                "Please provide the right path for help_files, refer to README.md for download help_dir"
            )
        if args.quantify:
            logging.info(
                "Running iqKM for both KM assignment and quantification")
            if not file.exists(args.fastq1):
                logging.error(
                    "Please provide the right path of raw reads file (fastq format) for KM quantification"
                )
            else:
                Workflow_iqkm(args.genome, args.fastq1, args.fastq2,
                              args.hmmdb, args.prefix, args.outdir,
                              args.help_dir, args.GE, args.meta, "hmmsearch",
                              args.force, args.dist, args.com,
                              args.include_weights, args.cpu, "prodigal",
                              args.skip)
        else:
            logging.info("Running iqKM for KM assignment")
            Workflow_identify(args.genome, args.hmmdb, args.prefix,
                              args.outdir, args.help_dir, args.meta,
                              "hmmsearch", args.force, args.dist, args.com,
                              args.include_weights, args.cpu, "prodigal",
                              args.skip)
Example #3
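Constructor of the combined assignment-and-quantification workflow starting from a raw (meta)genome assembly: it runs prodigal for gene prediction, remaps reads with bwa to quantify genes/KOs, runs hmmsearch against the Kofam HMMs, parses KOs, assigns KMs on sample and contig basis, applies the completeness/distance filters, and finally computes KO and KM abundances. Each step honours '--force' (always rerun), '--skip' (reuse existing output if present), and an input-vs-output timestamp check.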
    def __init__(self,
                 fna,
                 fsq1,
                 fsq2,
                 db,
                 prefix,
                 outdir,
                 help_dir,
                 GE,
                 meta=False,
                 ko_anno_tool="hmmsearch",
                 force=False,
                 dist=True,
                 com=66.67,
                 include_weights=True,
                 cpu=1,
                 gene_prediction_tool="prodigal",
                 skip=False):

        self._fna = fna
        self._fq1 = fsq1
        self._fq2 = fsq2
        self._hmmdb = db
        self._prefix = prefix
        self._GE = GE
        self._meta = meta
        self._cpu = cpu
        self._force = force
        self._dist = dist
        self._com = com
        self._include_weights = include_weights
        self._gene_predict_tool = gene_prediction_tool
        self._ko_anno_tool = ko_anno_tool
        self._outdir = outdir
        self._help_dir = help_dir
        self._skip = skip

        if self._prefix is None:
            self._prefix = ".".join(
                (os.path.basename(self._fna)).split(".")[:-1])

        # run prodigal
        logging.info("Running prodigal")
        file.isdir(os.path.join(self._outdir, "prodigal"))
        out_pep = os.path.join(self._outdir, "prodigal", self._prefix + ".pep")
        out_cds = os.path.join(self._outdir, "prodigal", self._prefix + ".cds")
        out_gff = os.path.join(self._outdir, "prodigal", self._prefix + ".gff")
        if self._force:
            cls_prod = Prodigal(self._fna, self._outdir, self._meta)
            cls_prod.run_prodigal(out_pep, out_cds, out_gff)
        elif self._skip:
            if file.exists(out_cds) and file.exists(out_pep):
                logging.info("Force skipping prodigal as user used '--skip'")
            else:
                logging.info(
                    "Failed to skip prodigal as prodigal output are missing")
                logging.info("Running prodigal")
                cls_prod = Prodigal(self._fna, self._outdir, self._meta)
                cls_prod.run_prodigal(out_pep, out_cds, out_gff)
        else:
            if file.isnewer(self._fna, out_pep):
                cls_prod = Prodigal(self._fna, self._outdir, self._meta)
                cls_prod.run_prodigal(out_pep, out_cds, out_gff)
            else:
                logging.info(
                    "Skip prodigal because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(out_pep, self._fna))

        # run bwa to remap reads to *.cds to quantify genes/KOs
        logging.info("Run remapping to quantify genes/KOs")
        file.isdir(os.path.join(self._outdir, "out_remap"))
        remap_dir = os.path.join(self._outdir, "out_remap")
        remap_out = os.path.join(remap_dir, self._prefix + "_unique.tab")
        if self._force:
            remap_cls = Remapping(out_cds, self._fq1, self._fq2, remap_dir,
                                  self._prefix, self._cpu)
            remap_cls.remapping()
        elif self._skip:
            if file.exists(remap_out):
                logging.info(
                    "Force skipping bwa mapping as user used '--skip'")
            else:
                logging.info(
                    "Failed to skip bwa mapping as mapping output is missing")
                logging.info("Run remapping to quantify genes/KOs")
                remap_cls = Remapping(out_cds, self._fq1, self._fq2, remap_dir,
                                      self._prefix, self._cpu)
                remap_cls.remapping()
        else:
            if file.isnewer(out_cds, remap_out) or file.isnewer(
                    self._fq1, remap_out):
                remap_cls = Remapping(out_cds, self._fq1, self._fq2, remap_dir,
                                      self._prefix, self._cpu)
                remap_cls.remapping()
            else:
                logging.info(
                    "Skip remapping because {} is newer than {} and {}, add '--force' if you want to rerun the computation"
                    .format(remap_out, out_cds, self._fq1))

        # run hmmsearch
        logging.info("Running hmmsearch")
        file.isdir(os.path.join(self._outdir, "hmmsearch"))
        hmm_out = os.path.join(self._outdir, "hmmsearch",
                               self._prefix + "_hmmsearch.tbl")
        hmm_log = os.path.join(self._outdir, "hmmsearch",
                               self._prefix + "_hmmsearch.log")
        if self._hmmdb is None:
            self._hmmdb = os.path.join(self._help_dir, "db/kofam.hmm")
        if self._force:
            hmm_cls = Hmmsearch(out_pep, self._cpu, self._outdir, self._hmmdb)
            hmm_cls.hmmsearch(hmm_out, hmm_log)
        elif self._skip:
            if file.exists(hmm_out):
                logging.info("Force skipping hmmsearch as user used '--skip'")
            else:
                logging.info(
                    "Failed to skip hmmsearch as hmmsearch output is missing")
                logging.info("Running hmmsearch")
                hmm_cls = Hmmsearch(out_pep, self._cpu, self._outdir,
                                    self._hmmdb)
                hmm_cls.hmmsearch(hmm_out, hmm_log)
        else:
            if file.isnewer(out_pep, hmm_out):
                hmm_cls = Hmmsearch(out_pep, self._cpu, self._outdir,
                                    self._hmmdb)
                hmm_cls.hmmsearch(hmm_out, hmm_log)
            else:
                logging.info(
                    "Skip hmmsearch because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(hmm_out, out_pep))

        # parse KO; results are written under outdir/KO_parsing
        logging.info("Parsing KO")
        file.isdir(os.path.join(self._outdir, "KO_parsing"))
        ko_output = os.path.join(self._outdir, "KO_parsing",
                                 self._prefix + ".ko")
        if self._force:
            parse_cls = ParseKo(
                self._ko_anno_tool,
                self._gene_predict_tool,
                out_pep,
                hmm_out,
                self._outdir,
            )
            parse_cls.write_out(ko_output)
            d_nuc_ko = parse_cls.parse_kohmm()
            d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]
        elif self._skip:
            if file.exists(ko_output):
                logging.info("Force skipping parsing KO as user used '--skip'")
                parse_cls = ParseKo(self._ko_anno_tool,
                                    self._gene_predict_tool, out_pep, hmm_out,
                                    self._outdir)
                d_nuc_ko = parse_cls.parse_kohmm()
                d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]
            else:
                logging.info(
                    "Failed to skip KO parsing as KO parsing output is missing"
                )
                logging.info("Parsing KO")
                parse_cls = ParseKo(
                    self._ko_anno_tool,
                    self._gene_predict_tool,
                    out_pep,
                    hmm_out,
                    self._outdir,
                )
                parse_cls.write_out(ko_output)
                d_nuc_ko = parse_cls.parse_kohmm()
                d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]
        else:
            if file.isnewer(hmm_out, ko_output):
                parse_cls = ParseKo(
                    self._ko_anno_tool,
                    self._gene_predict_tool,
                    out_pep,
                    hmm_out,
                    self._outdir,
                )
                parse_cls.write_out(ko_output)
                d_nuc_ko = parse_cls.parse_kohmm()
                d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]
            else:
                logging.info(
                    "Skip parsing KO because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(ko_output, hmm_out))
                parse_cls = ParseKo(
                    self._ko_anno_tool,
                    self._gene_predict_tool,
                    out_pep,
                    hmm_out,
                    self._outdir,
                )
                d_nuc_ko = parse_cls.parse_kohmm()
                d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]

        # Assigning KM
        logging.info("Assigning KM")
        file.isdir(os.path.join(self._outdir, "KM_assignment_unfiltered"))
        help_graphs = os.path.join(self._help_dir, "help_files/graphs.pkl")
        help_classes = os.path.join(self._help_dir,
                                    "help_files/all_pathways_class.txt")
        help_names = os.path.join(self._help_dir,
                                  "help_files/all_pathways_names.txt")
        (
            graphs,
            pathway_names,
            pathway_classes,
        ) = iqkm.give_pathways_weight.download_pathways(
            help_graphs, help_names, help_classes)
        kegg_output = os.path.join(self._outdir, "KM_assignment_unfiltered",
                                   self._prefix + ".summary.kegg")
        # COMMON INFO
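        # Sample-level pass: all KOs from the input are scored against every KM
        # graph at once, with an empty contig label.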
        using_graphs = copy.deepcopy(graphs)
        kegg_output_pathway = kegg_output + "_pathways.tsv"
        if self._force:
            edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                ko_output)
            file_out_summary = open(kegg_output_pathway, "wt")
            iqkm.give_pathways_weight.set_headers(file_out_summary, False)
            weights_of_KOs = iqkm.give_pathways_weight.get_weights_for_KOs(
                using_graphs)
            iqkm.give_pathways_weight.sort_out_pathways(
                using_graphs,
                edges,
                pathway_names,
                pathway_classes,
                "",
                file_out_summary,
                weights_of_KOs,
                self._include_weights,
            )
            file_out_summary.close()
        elif self._skip:
            if file.exists(kegg_output_pathway):
                logging.info(
                    "Force skipping KM assignment as user used '--skip'")
            else:
                logging.info(
                    "Failed to skip KM assignment as KM assignment output is missing"
                )
                logging.info("Assigning KM")
                edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                    ko_output)
                file_out_summary = open(kegg_output_pathway, "wt")
                iqkm.give_pathways_weight.set_headers(file_out_summary, False)
                weights_of_KOs = iqkm.give_pathways_weight.get_weights_for_KOs(
                    using_graphs)
                iqkm.give_pathways_weight.sort_out_pathways(
                    using_graphs,
                    edges,
                    pathway_names,
                    pathway_classes,
                    "",
                    file_out_summary,
                    weights_of_KOs,
                    self._include_weights,
                )
                file_out_summary.close()
        else:
            if file.isnewer(ko_output, kegg_output_pathway):
                edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                    ko_output)
                file_out_summary = open(kegg_output_pathway, "wt")
                iqkm.give_pathways_weight.set_headers(file_out_summary, False)
                weights_of_KOs = iqkm.give_pathways_weight.get_weights_for_KOs(
                    using_graphs)
                iqkm.give_pathways_weight.sort_out_pathways(
                    using_graphs,
                    edges,
                    pathway_names,
                    pathway_classes,
                    "",
                    file_out_summary,
                    weights_of_KOs,
                    self._include_weights,
                )
                file_out_summary.close()
            else:
                logging.info(
                    "Skip KM assignment because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(kegg_output_pathway, ko_output))

        # BY CONTIGS
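        # Per-contig pass: the KM graphs are deep-copied for each contig and only
        # that contig's KOs are scored, so completeness is evaluated per contig.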
        kegg_output_contig = kegg_output + "_contigs.tsv"
        if self._force:
            (
                graphs,
                pathway_names,
                pathway_classes,
            ) = iqkm.give_pathways_weight.download_pathways(
                help_graphs, help_names, help_classes)
            edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                ko_output)
            file_out_summary = open(kegg_output_contig, "wt")
            iqkm.give_pathways_weight.set_headers(file_out_summary, True)
            for contig in dict_KO_by_contigs:
                using_graphs = copy.deepcopy(graphs)
                edges = dict_KO_by_contigs[contig]
                iqkm.give_pathways_weight.sort_out_pathways(
                    using_graphs,
                    edges,
                    pathway_names,
                    pathway_classes,
                    contig,
                    file_out_summary,
                    weights_of_KOs,
                    self._include_weights,
                )
            file_out_summary.close()
        elif self._skip:
            if file.exists(kegg_output_contig):
                logging.info(
                    "Force skipping KM assignment as user used '--skip'")
            else:
                logging.info(
                    "Failed to skip KM assignment as KM assignment output is missing"
                )
                logging.info("Assigning KM")
                (
                    graphs,
                    pathway_names,
                    pathway_classes,
                ) = iqkm.give_pathways_weight.download_pathways(
                    help_graphs, help_names, help_classes)
                edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                    ko_output)
                file_out_summary = open(kegg_output_contig, "wt")
                iqkm.give_pathways_weight.set_headers(file_out_summary, True)
                for contig in dict_KO_by_contigs:
                    using_graphs = copy.deepcopy(graphs)
                    edges = dict_KO_by_contigs[contig]
                    iqkm.give_pathways_weight.sort_out_pathways(
                        using_graphs,
                        edges,
                        pathway_names,
                        pathway_classes,
                        contig,
                        file_out_summary,
                        weights_of_KOs,
                        self._include_weights,
                    )
                file_out_summary.close()

        else:
            if file.isnewer(ko_output, kegg_output_contig):
                (
                    graphs,
                    pathway_names,
                    pathway_classes,
                ) = iqkm.give_pathways_weight.download_pathways(
                    help_graphs, help_names, help_classes)
                edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                    ko_output)
                file_out_summary = open(kegg_output_contig, "wt")
                iqkm.give_pathways_weight.set_headers(file_out_summary, True)
                for contig in dict_KO_by_contigs:
                    using_graphs = copy.deepcopy(graphs)
                    edges = dict_KO_by_contigs[contig]
                    iqkm.give_pathways_weight.sort_out_pathways(
                        using_graphs,
                        edges,
                        pathway_names,
                        pathway_classes,
                        contig,
                        file_out_summary,
                        weights_of_KOs,
                        self._include_weights,
                    )
                file_out_summary.close()
            else:
                logging.info(
                    "Skip KM assignment because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(kegg_output_contig, ko_output))
        # calculate the minimum dist, apply the dist and com thresholds (or not) on contig basis, and apply the com threshold on sample basis
        logging.info("Calculating minimum distance within each KM")
        file.isdir(os.path.join(self._outdir, "KM_assignment_filtered"))
        out_dist = os.path.join(self._outdir, "KM_assignment_filtered",
                                self._prefix + "_km_on_contig.tsv")
        out_count = os.path.join(self._outdir, "KM_assignment_filtered",
                                 self._prefix + "_km_sample_count.tsv")
        if self._force:
            km = KM_dist(kegg_output_contig, self._com, self._ko_anno_tool,
                         self._gene_predict_tool, hmm_out, out_pep, self._cpu,
                         self._dist, self._outdir, self._help_dir)
            km.km_dist(d_ko_position, out_dist, out_count)
        elif self._skip:
            if file.exists(out_count) and file.exists(out_dist):
                logging.info(
                    "Force skipping KM minimum distance calculation as user used '--skip'"
                )
            else:
                logging.info(
                    "Failed to skip KM minimum distance calculation as output is missing"
                )
                logging.info("Calculating minimum distance within each KM")
                km = KM_dist(kegg_output_contig, self._com, self._ko_anno_tool,
                             self._gene_predict_tool, hmm_out, out_pep,
                             self._cpu, self._dist, self._outdir,
                             self._help_dir)
                km.km_dist(d_ko_position, out_dist, out_count)
        else:
            if file.isnewer(kegg_output_contig, out_count):
                km = KM_dist(kegg_output_contig, self._com, self._ko_anno_tool,
                             self._gene_predict_tool, hmm_out, out_pep,
                             self._cpu, self._dist, self._outdir,
                             self._help_dir)
                km.km_dist(d_ko_position, out_dist, out_count)
            else:
                logging.info(
                    "Skip KM minimum distance calculation because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(out_count, kegg_output_contig))

        # calculate the minimum dist within each KM and the KM abundance (GE-normalized or non-normalized), on both contig and sample basis
        logging.info("Calculating KM abundance")
        file.isdir(os.path.join(self._outdir, "out_abundance"))
        file.isdir(os.path.join(self._outdir, "out_abundance", "ko_abd"))
        file.isdir(os.path.join(self._outdir, "out_abundance",
                                "km_abd_sample"))
        file.isdir(os.path.join(self._outdir, "out_abundance",
                                "km_abd_contig"))
        output_ko = os.path.join(self._outdir, "out_abundance", "ko_abd",
                                 self._prefix + "_ko_abd.tsv")
        output_km_contig = os.path.join(
            self._outdir,
            "out_abundance",
            "km_abd_contig",
            self._prefix + "_km_contig_abd.tsv",
        )
        out_km_sample = os.path.join(
            self._outdir,
            "out_abundance",
            "km_abd_sample",
            self._prefix + "_km_sample_abd.tsv",
        )

        abd_cls = KM_abd(self._GE, remap_out, kegg_output_contig, self._com,
                         self._ko_anno_tool, self._gene_predict_tool, hmm_out,
                         out_pep, self._dist, self._outdir, self._help_dir)
        abd_cls.km_abd(
            d_nuc_ko,
            d_ko_position,
            d_position_gene,
            output_ko,
            output_km_contig,
            out_km_sample,
        )
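Example #4

The identification-only counterpart of the workflow above, starting from pre-computed gene predictions (nucleotide and protein FASTA, plus a gene prediction file when prokka was used): it runs hmmsearch, parses KOs, assigns KMs on sample and contig basis, and computes the minimum distance within each KM; no read remapping or abundance estimation is performed.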
    def __init__(self,
                 ffn,
                 faa,
                 db,
                 fp,
                 gene_predict_tool,
                 prefix,
                 ko_anno_tool="hmmsearch",
                 force=False,
                 dist=True,
                 com=66.67,
                 include_weights=True,
                 cpu=1,
                 outdir="./out"):

        self._ffn = ffn
        self._faa = faa
        self._hmmdb = db
        self._fp = fp
        self._prefix = prefix
        self._cpu = cpu
        self._force = force
        self._dist = dist
        self._com = com
        self._include_weights = include_weights
        self._gene_predict_tool = gene_predict_tool
        self._ko_anno_tool = ko_anno_tool
        self._outdir = outdir

        if self._prefix is None:
            self._prefix = ".".join(
                (os.path.basename(self._faa)).split(".")[:-1])
        if self._fp is None:
            if self._gene_predict_tool == "prodigal":
                self._fp = self._faa
            else:
                logging.error(
                    "Please provide gene prediction file (prokka output *.gff)"
                )

        pkg_dir = os.path.dirname(os.path.abspath(__file__))

        # run hmmsearch
        logging.info("Running hmmsearch")
        file.isdir(os.path.join(self._outdir, "hmmsearch"))
        hmm_out = os.path.join(self._outdir, "hmmsearch",
                               self._prefix + "_hmmsearch.tbl")
        hmm_log = os.path.join(self._outdir, "hmmsearch",
                               self._prefix + "_hmmsearch.log")
        if self._hmmdb is None:
            self._hmmdb = os.path.join(pkg_dir, "../db/kofam.hmm")
        if self._force:
            hmm_cls = Hmmsearch(self._faa, self._cpu, self._outdir,
                                self._hmmdb)
            hmm_cls.hmmsearch(hmm_out, hmm_log)
        else:
            if file.isnewer(self._faa, hmm_out):
                hmm_cls = Hmmsearch(self._faa, self._cpu, self._outdir,
                                    self._hmmdb)
                hmm_cls.hmmsearch(hmm_out, hmm_log)
            else:
                logging.info(
                    "Skip hmmsearch because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(hmm_out, self._faa))

        # parse KO; results are written under outdir/ko_parsing
        logging.info("Parsing KO")
        file.isdir(os.path.join(self._outdir, "ko_parsing"))
        ko_output = os.path.join(self._outdir, "ko_parsing",
                                 self._prefix + ".ko")
        if self._force:
            parse_cls = ParseKo(self._ko_anno_tool, self._gene_predict_tool,
                                self._fp, hmm_out, self._outdir)
            parse_cls.write_out(ko_output)
            d_nuc_ko = parse_cls.parse_kohmm()
            d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]
        else:
            if file.isnewer(hmm_out, ko_output):
                parse_cls = ParseKo(self._ko_anno_tool,
                                    self._gene_predict_tool, self._fp, hmm_out,
                                    self._outdir)
                parse_cls.write_out(ko_output)
                d_nuc_ko = parse_cls.parse_kohmm()
                d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]
            else:
                logging.info(
                    "Skip parsing KO because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(ko_output, hmm_out))
                parse_cls = ParseKo(self._ko_anno_tool,
                                    self._gene_predict_tool, self._fp, hmm_out,
                                    self._outdir)
                d_nuc_ko = parse_cls.parse_kohmm()
                d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]

        # Assigning KM
        logging.info("Assigning KM")
        file.isdir(os.path.join(self._outdir, "KM_assignment_unfiltered"))
        help_graphs = os.path.join(pkg_dir, '../help_files/graphs.pkl')
        help_classes = os.path.join(pkg_dir,
                                    '../help_files/all_pathways_class.txt')
        help_names = os.path.join(pkg_dir,
                                  '../help_files/all_pathways_names.txt')
        graphs, pathway_names, pathway_classes = iqkm.give_pathways_weight.download_pathways(
            help_graphs, help_names, help_classes)
        kegg_output = os.path.join(self._outdir, "KM_assignment_unfiltered",
                                   self._prefix + '.summary.kegg')
        # COMMON INFO
        using_graphs = copy.deepcopy(graphs)
        kegg_output_pathway = kegg_output + '_pathways.tsv'
        if self._force:
            edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                ko_output)
            file_out_summary = open(kegg_output_pathway, "wt")
            iqkm.give_pathways_weight.set_headers(file_out_summary, False)
            weights_of_KOs = iqkm.give_pathways_weight.get_weights_for_KOs(
                using_graphs)
            iqkm.give_pathways_weight.sort_out_pathways(
                using_graphs, edges, pathway_names, pathway_classes, '',
                file_out_summary, weights_of_KOs, self._include_weights)
            file_out_summary.close()
        else:
            if file.isnewer(ko_output, kegg_output_pathway):
                edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                    ko_output)
                file_out_summary = open(kegg_output_pathway, "wt")
                iqkm.give_pathways_weight.set_headers(file_out_summary, False)
                weights_of_KOs = iqkm.give_pathways_weight.get_weights_for_KOs(
                    using_graphs)
                iqkm.give_pathways_weight.sort_out_pathways(
                    using_graphs, edges, pathway_names, pathway_classes, '',
                    file_out_summary, weights_of_KOs, self._include_weights)
                file_out_summary.close()
            else:
                logging.info(
                    "Skip KM assignment because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(kegg_output_pathway, ko_output))

        # BY CONTIGS
        kegg_output_contig = kegg_output + '_contigs.tsv'
        if self._force:
            graphs, pathway_names, pathway_classes = iqkm.give_pathways_weight.download_pathways(
                help_graphs, help_names, help_classes)
            edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                ko_output)
            file_out_summary = open(kegg_output_contig, "wt")
            iqkm.give_pathways_weight.set_headers(file_out_summary, True)
            for contig in dict_KO_by_contigs:
                using_graphs = copy.deepcopy(graphs)
                edges = dict_KO_by_contigs[contig]
                iqkm.give_pathways_weight.sort_out_pathways(
                    using_graphs, edges, pathway_names, pathway_classes,
                    contig, file_out_summary, weights_of_KOs,
                    self._include_weights)
            file_out_summary.close()
        else:
            if file.isnewer(ko_output, kegg_output_contig):
                graphs, pathway_names, pathway_classes = iqkm.give_pathways_weight.download_pathways(
                    help_graphs, help_names, help_classes)
                edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                    ko_output)
                file_out_summary = open(kegg_output_contig, "wt")
                iqkm.give_pathways_weight.set_headers(file_out_summary, True)
                for contig in dict_KO_by_contigs:
                    using_graphs = copy.deepcopy(graphs)
                    edges = dict_KO_by_contigs[contig]
                    iqkm.give_pathways_weight.sort_out_pathways(
                        using_graphs, edges, pathway_names, pathway_classes,
                        contig, file_out_summary, weights_of_KOs,
                        self._include_weights)
                file_out_summary.close()
            else:
                logging.info(
                    "Skip KM assignment because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(kegg_output_contig, ko_output))

        # calculate the minimum dist, and apply dist threshold or not
        logging.info("Calculating minimum distance within each KM")
        km = KM_dist(kegg_output_contig, self._com, self._ko_anno_tool,
                     self._gene_predict_tool, hmm_out, self._fp, self._cpu,
                     self._dist, self._outdir)
        file.isdir(os.path.join(self._outdir, "KM_assignment_filtered"))
        parse = ParseKo(self._ko_anno_tool, self._gene_predict_tool, self._fp,
                        hmm_out, self._outdir)
        d_ko_position = (parse.parseKo())[1]
        out_dist = os.path.join(self._outdir, "KM_assignment_filtered",
                                self._prefix + "_dist.tsv")
        km.km_dist(d_ko_position, out_dist)
Example #5
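The quantification counterpart of the previous identification-only workflow: it additionally remaps the raw reads to the provided gene sequences (ffn) with bwa, then runs hmmsearch on the proteins, parses KOs, assigns KMs, and computes KO and KM abundances on both contig and sample basis, optionally normalized by the microbe-census genome equivalent (GE).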
    def __init__(
        self,
        ffn,
        fsq1,
        fsq2,
        faa,
        db,
        fp,
        gene_predict_tool,
        prefix,
        GE,
        ko_anno_tool="hmmsearch",
        force=False,
        dist=True,
        com=66.67,
        include_weights=True,
        cpu=1,
        outdir="./out",
    ):

        self._ffn = ffn
        self._fq1 = fsq1
        self._fq2 = fsq2
        self._faa = faa
        self._hmmdb = db
        self._fp = fp
        self._prefix = prefix
        self._GE = GE
        self._cpu = cpu
        self._force = force
        self._dist = dist
        self._com = com
        self._include_weights = include_weights
        self._gene_predict_tool = gene_predict_tool
        self._ko_anno_tool = ko_anno_tool
        self._outdir = outdir

        if self._prefix is None:
            self._prefix = ".".join(
                (os.path.basename(self._faa)).split(".")[:-1])
        if self._fp is None:
            if self._gene_predict_tool == "prodigal":
                self._fp = self._faa
            else:
                logging.error(
                    "Please provide gene prediction file (prokka output *.gff)"
                )

        pkg_dir = os.path.dirname(os.path.abspath(__file__))

        # run bwa to remap reads to *.ffn to quantify genes/KOs
        logging.info("Run remapping to quantify genes/KOs")
        file.isdir(os.path.join(self._outdir, "out_remap"))
        remap_dir = os.path.join(self._outdir, "out_remap")
        remap_out = os.path.join(remap_dir, self._prefix + "_unique.tab")
        if self._force:
            remap_cls = Remapping(self._ffn, self._fq1, self._fq2, remap_dir,
                                  self._prefix, self._cpu)
            remap_cls.remapping()
        else:
            if file.isnewer(self._ffn, remap_out) or file.isnewer(
                    self._fq1, remap_out):
                remap_cls = Remapping(self._ffn, self._fq1, self._fq2,
                                      remap_dir, self._prefix, self._cpu)
                remap_cls.remapping()
            else:
                logging.info(
                    "Skip remapping because {} is newer than {} and {}, add '--force' if you want to rerun the computation"
                    .format(remap_out, self._ffn, self._fq1))

        # run hmmsearch
        logging.info("Running hmmsearch")
        file.isdir(os.path.join(self._outdir, "hmmsearch"))
        hmm_out = os.path.join(self._outdir, "hmmsearch",
                               self._prefix + "_hmmsearch.tbl")
        hmm_log = os.path.join(self._outdir, "hmmsearch",
                               self._prefix + "_hmmsearch.log")
        if self._hmmdb is None:
            self._hmmdb = os.path.join(pkg_dir, "../db/kofam.hmm")
        if self._force:
            hmm_cls = Hmmsearch(self._faa, self._cpu, self._outdir,
                                self._hmmdb)
            hmm_cls.hmmsearch(hmm_out, hmm_log)
        else:
            if file.isnewer(self._faa, hmm_out):
                hmm_cls = Hmmsearch(self._faa, self._cpu, self._outdir,
                                    self._hmmdb)
                hmm_cls.hmmsearch(hmm_out, hmm_log)
            else:
                logging.info(
                    "Skip hmmsearch because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(hmm_out, self._faa))

        # parse KO; results are written under outdir/ko_parsing
        logging.info("Parsing KO")
        file.isdir(os.path.join(self._outdir, "ko_parsing"))
        ko_output = os.path.join(self._outdir, "ko_parsing",
                                 self._prefix + ".ko")
        if self._force:
            parse_cls = ParseKo(
                self._ko_anno_tool,
                self._gene_predict_tool,
                self._fp,
                hmm_out,
                self._outdir,
            )
            parse_cls.write_out(ko_output)
            d_nuc_ko = parse_cls.parse_kohmm()
            d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]
        else:
            if file.isnewer(hmm_out, ko_output):
                parse_cls = ParseKo(
                    self._ko_anno_tool,
                    self._gene_predict_tool,
                    self._fp,
                    hmm_out,
                    self._outdir,
                )
                parse_cls.write_out(ko_output)
                d_nuc_ko = parse_cls.parse_kohmm()
                d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]
            else:
                logging.info(
                    "Skip parsing KO because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(ko_output, hmm_out))
                parse_cls = ParseKo(
                    self._ko_anno_tool,
                    self._gene_predict_tool,
                    self._fp,
                    hmm_out,
                    self._outdir,
                )
                d_nuc_ko = parse_cls.parse_kohmm()
                d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]

        # Assigning KM
        logging.info("Assigning KM")
        file.isdir(os.path.join(self._outdir, "KM_assignment_unfiltered"))
        help_graphs = os.path.join(pkg_dir, "../help_files/graphs.pkl")
        help_classes = os.path.join(pkg_dir,
                                    "../help_files/all_pathways_class.txt")
        help_names = os.path.join(pkg_dir,
                                  "../help_files/all_pathways_names.txt")
        (
            graphs,
            pathway_names,
            pathway_classes,
        ) = iqkm.give_pathways_weight.download_pathways(
            help_graphs, help_names, help_classes)
        kegg_output = os.path.join(self._outdir, "KM_assignment_unfiltered",
                                   self._prefix + ".summary.kegg")
        # COMMON INFO
        using_graphs = copy.deepcopy(graphs)
        kegg_output_pathway = kegg_output + "_pathways.tsv"
        if self._force:
            edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                ko_output)
            file_out_summary = open(kegg_output_pathway, "wt")
            iqkm.give_pathways_weight.set_headers(file_out_summary, False)
            weights_of_KOs = iqkm.give_pathways_weight.get_weights_for_KOs(
                using_graphs)
            iqkm.give_pathways_weight.sort_out_pathways(
                using_graphs,
                edges,
                pathway_names,
                pathway_classes,
                "",
                file_out_summary,
                weights_of_KOs,
                self._include_weights,
            )
            file_out_summary.close()
        else:
            if file.isnewer(ko_output, kegg_output_pathway):
                edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                    ko_output)
                file_out_summary = open(kegg_output_pathway, "wt")
                iqkm.give_pathways_weight.set_headers(file_out_summary, False)
                weights_of_KOs = iqkm.give_pathways_weight.get_weights_for_KOs(
                    using_graphs)
                iqkm.give_pathways_weight.sort_out_pathways(
                    using_graphs,
                    edges,
                    pathway_names,
                    pathway_classes,
                    "",
                    file_out_summary,
                    weights_of_KOs,
                    self._include_weights,
                )
                file_out_summary.close()
            else:
                logging.info(
                    "Skip KM assignment because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(kegg_output_pathway, ko_output))

        # BY CONTIGS
        kegg_output_contig = kegg_output + "_contigs.tsv"
        if self._force:
            (
                graphs,
                pathway_names,
                pathway_classes,
            ) = iqkm.give_pathways_weight.download_pathways(
                help_graphs, help_names, help_classes)
            edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                ko_output)
            file_out_summary = open(kegg_output_contig, "wt")
            iqkm.give_pathways_weight.set_headers(file_out_summary, True)
            for contig in dict_KO_by_contigs:
                using_graphs = copy.deepcopy(graphs)
                edges = dict_KO_by_contigs[contig]
                iqkm.give_pathways_weight.sort_out_pathways(
                    using_graphs,
                    edges,
                    pathway_names,
                    pathway_classes,
                    contig,
                    file_out_summary,
                    weights_of_KOs,
                    self._include_weights,
                )
            file_out_summary.close()
        else:
            if file.isnewer(ko_output, kegg_output_contig):
                (
                    graphs,
                    pathway_names,
                    pathway_classes,
                ) = iqkm.give_pathways_weight.download_pathways(
                    help_graphs, help_names, help_classes)
                edges, dict_KO_by_contigs = iqkm.give_pathways_weight.get_list_items(
                    ko_output)
                file_out_summary = open(kegg_output_contig, "wt")
                iqkm.give_pathways_weight.set_headers(file_out_summary, True)
                for contig in dict_KO_by_contigs:
                    using_graphs = copy.deepcopy(graphs)
                    edges = dict_KO_by_contigs[contig]
                    iqkm.give_pathways_weight.sort_out_pathways(
                        using_graphs,
                        edges,
                        pathway_names,
                        pathway_classes,
                        contig,
                        file_out_summary,
                        weights_of_KOs,
                        self._include_weights,
                    )
                file_out_summary.close()
            else:
                logging.info(
                    "Skip KM assignment because {} is newer than {}, add '--force' if you want to rerun the computation"
                    .format(kegg_output_contig, ko_output))

        # calculate the minimum dist within each KM and the KM abundance (GE-normalized or non-normalized), on both contig and sample basis
        logging.info("Calculating minimum distance within KM and KM abundance")
        file.isdir(os.path.join(self._outdir, "out_abundance"))
        file.isdir(os.path.join(self._outdir, "out_abundance", "ko_abd"))
        file.isdir(os.path.join(self._outdir, "out_abundance",
                                "km_abd_sample"))
        file.isdir(os.path.join(self._outdir, "out_abundance",
                                "km_abd_contig"))
        output_ko = os.path.join(self._outdir, "out_abundance", "ko_abd",
                                 self._prefix + "_ko_abd.tsv")
        output_km_contig = os.path.join(
            self._outdir,
            "out_abundance",
            "km_abd_contig",
            self._prefix + "_km_contig_abd.tsv",
        )
        out_km_sample = os.path.join(
            self._outdir,
            "out_abundance",
            "km_abd_sample",
            self._prefix + "_km_sample_abd.tsv",
        )

        abd_cls = KM_abd(
            self._GE,
            remap_out,
            kegg_output_contig,
            self._com,
            self._ko_anno_tool,
            self._gene_predict_tool,
            hmm_out,
            self._fp,
            self._dist,
            self._outdir,
        )
        abd_cls.km_abd(
            d_nuc_ko,
            d_ko_position,
            d_position_gene,
            output_ko,
            output_km_contig,
            out_km_sample,
        )