def ice_quiver_postprocess_a_cluster_bin(cluster_out_dir, ipq_opts):
    """
    ice_quiver_postprocess a cluster bin, create summary.json,
    cluster_report.csv, hq|lq_isoforms.fa|fq.
    Return the HQ and LQ isoform FASTQ file paths.
    Parameters:
      cluster_out_dir - root dir running ICE, ice_partial and ice_quiver
                        for this cluster bin.
    """
    _jobs_log = op.join(cluster_out_dir, "log", "submitted_quiver_jobs.txt")
    shell_scripts = []
    for file_name in os.listdir(op.join(cluster_out_dir, "quivered")):
        if file_name.endswith(".sh"):
            shell_scripts.append(file_name)
    with open(_jobs_log, 'w') as f:
        f.write("\n".join(["\t".join(["local", s]) for s in shell_scripts]))

    icep = IceQuiverPostprocess(root_dir=cluster_out_dir,
                                ipq_opts=ipq_opts)

    icep.run()
    return (icep.quivered_good_fq, icep.quivered_bad_fq)
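
A minimal usage sketch for the function above, assuming the cluster bin directories already contain finished quiver jobs; the bin paths and option values are illustrative and not taken from the source.

# Hedged sketch: postprocess two hypothetical cluster bins and collect HQ/LQ FASTQ paths.
ipq_opts = IceQuiverHQLQOptions(qv_trim_5=100, qv_trim_3=30,
                                hq_quiver_min_accuracy=0.99)  # illustrative values
hq_lq_fqs = []
for cluster_out_dir in ("cluster0/cluster_out", "cluster1/cluster_out"):  # hypothetical bins
    hq_fq, lq_fq = ice_quiver_postprocess_a_cluster_bin(cluster_out_dir, ipq_opts)
    hq_lq_fqs.append((hq_fq, lq_fq))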
Example #2
    def run(self):
        """Run"""
        iceq = IceQuiver(root_dir=self.root_dir, bas_fofn=self.bas_fofn,
                         fasta_fofn=self.fasta_fofn, sge_opts=self.sge_opts,
                         tmp_dir=self.tmp_dir)
        iceq.validate_inputs()
        iceq.run()

        icepq = IceQuiverPostprocess(root_dir=self.root_dir,
                                     use_sge=self.sge_opts.use_sge,
                                     quit_if_not_done=False,
                                     ipq_opts=self.ipq_opts)
        icepq.run()
        return 0
Example #3
    def run(self):
        """Run"""
        iceq = IceQuiver(root_dir=self.root_dir,
                         bas_fofn=self.bas_fofn,
                         fasta_fofn=self.fasta_fofn,
                         sge_opts=self.sge_opts,
                         tmp_dir=self.tmp_dir)
        iceq.validate_inputs()
        iceq.run()

        icepq = IceQuiverPostprocess(root_dir=self.root_dir,
                                     use_sge=self.sge_opts.use_sge,
                                     quit_if_not_done=False,
                                     ipq_opts=self.ipq_opts)
        icepq.run()
        return 0
Example #4
    def __init__(self, combined_dir, sample_name, split_dirs, ipq_opts):
        super(CombineRunner, self).__init__(combined_dir=combined_dir)

        self.sample_name = sample_name
        self.split_dirs = split_dirs
        self.split_indices = range(0, len(split_dirs))

        self.hq_fq_fns, self.hq_fa_fns = [], []
        self.lq_fq_fns, self.lq_fa_fns = [], []
        self.consensus_isoforms_fns = []  # unpolished consensus isoforms in split dirs
        self.uc_pickle_fns, self.partial_uc_pickle_fns = [], []  # uc pickles and partial uc pickles
        for split_dir in self.split_dirs:
            ipq_f = IceQuiverPostprocess(root_dir=split_dir,
                                         ipq_opts=ipq_opts,
                                         no_log_f=True,
                                         make_dirs=False)
            self.hq_fq_fns.append(ipq_f.quivered_good_fq)
            self.hq_fa_fns.append(ipq_f.quivered_good_fa)
            self.lq_fq_fns.append(ipq_f.quivered_bad_fq)
            self.lq_fa_fns.append(ipq_f.quivered_bad_fa)
            self.consensus_isoforms_fns.append(ipq_f.final_consensus_fa)
            self.uc_pickle_fns.append(ipq_f.final_pickle_fn)
            self.partial_uc_pickle_fns.append(ipq_f.nfl_all_pickle_fn)
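
A hedged instantiation sketch for the CombineRunner above, mirroring how it is driven in Example #11; the directory names, sample name, and option values are illustrative.

ipq_opts = IceQuiverHQLQOptions(qv_trim_5=100, qv_trim_3=30,
                                hq_quiver_min_accuracy=0.99)  # illustrative values
runner = CombineRunner(combined_dir="combined",
                       sample_name="mysample",          # hypothetical sample name
                       split_dirs=["bin0/cluster_out",  # hypothetical per-bin dirs
                                   "bin1/cluster_out"],
                       ipq_opts=ipq_opts)
runner.run()  # merge step, as invoked in Example #11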
Example #5
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
        qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
        hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(
        input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(
        ".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])),
                           "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(
        split_indices=cluster_bin_indices,
        split_hq_fns=hq_fq_fns,
        split_lq_fns=lq_fq_fns,
        combined_hq_fa=combined_files.all_hq_fa,
        combined_hq_fq=combined_files.all_hq_fq,
        combined_lq_fa=combined_files.all_lq_fa,
        combined_lq_fq=combined_files.all_lq_fq,
        hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
        sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa)  #'HQ isoforms'
    ln(combined_files.all_hq_fq, out_hq_fq)  #'HQ isoforms'
    ln(combined_files.all_lq_fa, out_lq_fa)  #'LQ isoforms'
    ln(combined_files.all_lq_fq, out_lq_fq)  #'LQ isoforms'
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(
        split_indices=cluster_bin_indices,
        split_files=split_consensus_isoforms,
        combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
        sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)  # consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s",
             combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary)  # "cluster summary"

    log.info("Writing cluster report to %s",
             combined_files.all_cluster_report_fn)
    write_combined_cluster_report(
        split_indices=cluster_bin_indices,
        split_uc_pickles=split_uc_pickles,
        split_partial_uc_pickles=split_partial_uc_pickles,
        report_fn=combined_files.all_cluster_report_fn,
        sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report)  # "cluster report"
Example #6
    def run(self):
        """Execute ice_quiver.py all|i|merge|postprocess."""
        cmd = self.args.subCommand
        logging.info("Running {f} {cmd} v{v}.".format(f=op.basename(__file__),
                                                      cmd=cmd, v=get_version()))
        cmd_str = ""
        try:
            args = self.args
            obj = None
            if cmd == "all":
                sge_opts = SgeOptions(unique_id=args.unique_id,
                                      use_sge=args.use_sge,
                                      max_sge_jobs=args.max_sge_jobs,
                                      blasr_nproc=args.blasr_nproc,
                                      quiver_nproc=args.quiver_nproc)
                ipq_opts = IceQuiverHQLQOptions(
                    hq_isoforms_fa=args.hq_isoforms_fa,
                    hq_isoforms_fq=args.hq_isoforms_fq,
                    lq_isoforms_fa=args.lq_isoforms_fa,
                    lq_isoforms_fq=args.lq_isoforms_fq,
                    qv_trim_5=args.qv_trim_5,
                    qv_trim_3=args.qv_trim_3,
                    hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)
                obj = IceQuiverAll(root_dir=args.root_dir,
                                   bas_fofn=args.bas_fofn,
                                   fasta_fofn=None,
                                   sge_opts=sge_opts,
                                   ipq_opts=ipq_opts,
                                   tmp_dir=args.tmp_dir)
            elif cmd == "i":
                sge_opts = SgeOptions(unique_id=args.unique_id,
                                      use_sge=args.use_sge,
                                      max_sge_jobs=args.max_sge_jobs,
                                      blasr_nproc=args.blasr_nproc,
                                      quiver_nproc=args.quiver_nproc)
                obj = IceQuiverI(root_dir=args.root_dir, i=args.i, N=args.N,
                                 bas_fofn=args.bas_fofn,
                                 fasta_fofn=None,
                                 sge_opts=sge_opts,
                                 tmp_dir=args.tmp_dir)
            elif cmd == "merge":
                obj = IceQuiverMerge(root_dir=args.root_dir, N=args.N)
            elif cmd == "postprocess":
                ipq_opts = IceQuiverHQLQOptions(
                    hq_isoforms_fa=args.hq_isoforms_fa,
                    hq_isoforms_fq=args.hq_isoforms_fq,
                    lq_isoforms_fa=args.lq_isoforms_fa,
                    lq_isoforms_fq=args.lq_isoforms_fq,
                    qv_trim_5=args.qv_trim_5,
                    qv_trim_3=args.qv_trim_3,
                    hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)
                obj = IceQuiverPostprocess(root_dir=args.root_dir,
                                           ipq_opts=ipq_opts,
                                           use_sge=args.use_sge,
                                           quit_if_not_done=args.quit_if_not_done,
                                           summary_fn=args.summary_fn,
                                           report_fn=args.report_fn)
            else:
                raise ValueError("Unknown command passed to {f}: {cmd}.".
                                 format(f=op.basename(__file__), cmd=cmd))
            cmd_str = obj.cmd_str()
            logging.info("Running CMD: {cmd_str}".format(cmd_str=cmd_str))
            obj.run()
        except:
            logging.exception("Exiting {cmd_str} with return code 1.".
                              format(cmd_str=cmd_str))
            return 1
        return 0
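
A hedged sketch of invoking the postprocess step directly from Python, equivalent to the "postprocess" branch above; the file names, root directory, and thresholds are illustrative.

ipq_opts = IceQuiverHQLQOptions(hq_isoforms_fa="hq_isoforms.fasta",  # illustrative paths
                                hq_isoforms_fq="hq_isoforms.fastq",
                                lq_isoforms_fa="lq_isoforms.fasta",
                                lq_isoforms_fq="lq_isoforms.fastq",
                                qv_trim_5=100, qv_trim_3=30,
                                hq_quiver_min_accuracy=0.99)
icepq = IceQuiverPostprocess(root_dir="cluster_out",  # hypothetical root dir
                             ipq_opts=ipq_opts,
                             use_sge=False,
                             quit_if_not_done=False)
icepq.run()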
Example #7
    def run(self):
        """
        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samto5h, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None
        """
        if guess_file_format(self.bas_fofn) != FILE_FORMATS.BAM:
            # Create input.fasta.fofn from bas_fofn
            self.add_log("Creating fasta fofn from bas/bax.h5/bam fofn",
                         level=logging.INFO)
            if self.fasta_fofn is None:
                self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
            self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
            convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                                  out_filename=self.fasta_fofn,
                                  fasta_out_dir=self.nfl_dir)
        else:
            self.fasta_fofn = None

        # Split non-full-length reads into smaller fasta files
        # and save files to root_dir/nfl_00.fasta, ..., .
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.nfl_reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.ice_opts.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Generating dazz DB for final.consensus.fasta
        ref_obj = DazzIDHandler(input_filename=self.final_consensus_fa,
                                converted=False)
        ref_obj.make_db()
        msg = "Dazz DB made for: " + ref_obj.dazz_filename
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("Initializing IceAllPartials.", level=logging.INFO)

        self.icep = IceAllPartials(root_dir=self.root_dir,
                                   fasta_filenames=self._nfl_splitted_fas,
                                   ref_fasta=self.final_consensus_fa,
                                   out_pickle=self.nfl_all_pickle_fn,
                                   sge_opts=self.sge_opts,
                                   ccs_fofn=self.ccs_fofn)
        self.add_log("IceAllPartials log: {f}.".format(f=self.icep.log_fn),
                     level=logging.INFO)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        self.add_log("Initializing IceQuiver.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts,
                              tmp_dir=self.tmp_dir)
        self.add_log("IceQuiver log: {f}.".format(f=self.iceq.log_fn),
                     level=logging.INFO)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        self.add_log("Initializing IceQuiverPostprocess.", level=logging.INFO)
        self.icepq = IceQuiverPostprocess(root_dir=self.root_dir,
                                          use_sge=self.sge_opts.use_sge,
                                          quit_if_not_done=False,
                                          ipq_opts=self.ipq_opts)
        self.add_log(
            "IceQuiverPostprocess log: {f}.".format(f=self.icepq.log_fn),
            level=logging.INFO)
        self.icepq.run()
        self.add_log("IceQuiverPostprocess finished.", level=logging.INFO)
Example #8
class Polish(IceFiles):
    """Polish isoforms clusters using Quiver."""
    def __init__(self,
                 root_dir,
                 nfl_fa,
                 bas_fofn,
                 ccs_fofn,
                 ice_opts,
                 sge_opts,
                 ipq_opts,
                 fasta_fofn=None,
                 tmp_dir=None):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fasta
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. ccs.fofn of ccs files.

        ipq_opts --- IceQuiverHQLQOptions
                     qv_trim_5: ignore QV of n bases in the 5' end
                     qv_trim_3: ignore QV of n bases in the 3' end
                     hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                      to mark an isoform as high quality
                     hq_isoforms_fa|fq: polished, high quality consensus
                                        isoforms in fasta|q
                     lq_isoforms_fa|fq: polished, low quality consensus
                                        isoforms in fasta|q
        """
        IceFiles.__init__(self,
                          prog_name="IcePolish",
                          root_dir=root_dir,
                          bas_fofn=bas_fofn,
                          ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn,
                          tmp_dir=tmp_dir)
        self.nfl_fa = realpath(nfl_fa)
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        self.ipq_opts = ipq_opts

        self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(
            self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

        self.icep = None  # IceAllPartials.
        self.iceq = None  # IceQuiver
        self.icepq = None  # IceQuiverPostprocess
        self._nfl_splitted_fas = None

        self.validate_inputs()

    def validate_inputs(self):
        """
        Validate input directories: root_dir, and
        files: nfl_fa, bas_fofn, ccs_fofn.
        """
        self.add_log("Validating inputs.")
        errMsg = ""
        if not op.exists(self.root_dir):
            errMsg = "Root dir {d} is not an existing directory!".\
                format(d=self.root_dir)
        if not op.exists(self.nfl_fa):
            errMsg = "Failed to find non-full-length reads {f}!".\
                format(f=self.nfl_fa)
        if self.bas_fofn is None:
            errMsg = "bas_fofn (subreadset) must be specified."
        elif not op.exists(self.bas_fofn):
            errMsg = "Failed to find bas fofn (subreadset) {f}!".format(
                f=self.bas_fofn)
        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise ValueError(errMsg)

    def run(self):
        """
        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samto5h, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None
        """
        if guess_file_format(self.bas_fofn) != FILE_FORMATS.BAM:
            # Create input.fasta.fofn from bas_fofn
            self.add_log("Creating fasta fofn from bas/bax.h5/bam fofn",
                         level=logging.INFO)
            if self.fasta_fofn is None:
                self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
            self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
            convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                                  out_filename=self.fasta_fofn,
                                  fasta_out_dir=self.nfl_dir)
        else:
            self.fasta_fofn = None

        # Split non-full-length reads into smaller fasta files
        # and save files to root_dir/nfl_00.fasta, ..., .
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.nfl_reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.ice_opts.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Generating dazz DB for final.consensus.fasta
        ref_obj = DazzIDHandler(input_filename=self.final_consensus_fa,
                                converted=False)
        ref_obj.make_db()
        msg = "Dazz DB made for: " + ref_obj.dazz_filename
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("Initializing IceAllPartials.", level=logging.INFO)

        self.icep = IceAllPartials(root_dir=self.root_dir,
                                   fasta_filenames=self._nfl_splitted_fas,
                                   ref_fasta=self.final_consensus_fa,
                                   out_pickle=self.nfl_all_pickle_fn,
                                   sge_opts=self.sge_opts,
                                   ccs_fofn=self.ccs_fofn)
        self.add_log("IceAllPartials log: {f}.".format(f=self.icep.log_fn),
                     level=logging.INFO)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        self.add_log("Initializing IceQuiver.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts,
                              tmp_dir=self.tmp_dir)
        self.add_log("IceQuiver log: {f}.".format(f=self.iceq.log_fn),
                     level=logging.INFO)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        self.add_log("Initializing IceQuiverPostprocess.", level=logging.INFO)
        self.icepq = IceQuiverPostprocess(root_dir=self.root_dir,
                                          use_sge=self.sge_opts.use_sge,
                                          quit_if_not_done=False,
                                          ipq_opts=self.ipq_opts)
        self.add_log(
            "IceQuiverPostprocess log: {f}.".format(f=self.icepq.log_fn),
            level=logging.INFO)
        self.icepq.run()
        self.add_log("IceQuiverPostprocess finished.", level=logging.INFO)
Example #9
    def run(self):
        """
        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samto5h, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None
        """
        if guess_file_format(self.bas_fofn) != FILE_FORMATS.BAM:
            # Create input.fasta.fofn from bas_fofn
            self.add_log("Creating fasta fofn from bas/bax.h5/bam fofn",
                         level=logging.INFO)
            if self.fasta_fofn is None:
                self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
            self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
            convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                                  out_filename=self.fasta_fofn,
                                  fasta_out_dir=self.nfl_dir)
        else:
            self.fasta_fofn = None

        # Split non-full-length reads into smaller fasta files
        # and save files to root_dir/nfl_00.fasta, ..., .
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.nfl_reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(input_fasta=self.nfl_fa,
                                            reads_per_split=self.ice_opts.nfl_reads_per_split,
                                            out_dir=self.nfl_dir,
                                            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Generating dazz DB for final.consensus.fasta
        ref_obj = DazzIDHandler(input_filename=self.final_consensus_fa, converted=False)
        ref_obj.make_db()
        msg = "Dazz DB made for: " + ref_obj.dazz_filename
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("Initializing IceAllPartials.", level=logging.INFO)

        self.icep = IceAllPartials(
            root_dir=self.root_dir,
            fasta_filenames=self._nfl_splitted_fas,
            ref_fasta=self.final_consensus_fa,
            out_pickle=self.nfl_all_pickle_fn,
            sge_opts=self.sge_opts,
            ccs_fofn=self.ccs_fofn)
        self.add_log("IceAllPartials log: {f}.".format(f=self.icep.log_fn),
                     level=logging.INFO)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        self.add_log("Initializing IceQuiver.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts,
                              tmp_dir=self.tmp_dir)
        self.add_log("IceQuiver log: {f}.".format(f=self.iceq.log_fn),
                     level=logging.INFO)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        self.add_log("Initializing IceQuiverPostprocess.", level=logging.INFO)
        self.icepq = IceQuiverPostprocess(root_dir=self.root_dir,
                                          use_sge=self.sge_opts.use_sge,
                                          quit_if_not_done=False,
                                          ipq_opts=self.ipq_opts)
        self.add_log("IceQuiverPostprocess log: {f}.".
                     format(f=self.icepq.log_fn), level=logging.INFO)
        self.icepq.run()
        self.add_log("IceQuiverPostprocess finished.", level=logging.INFO)
Example #10
class Polish(IceFiles):

    """Polish isoforms clusters using Quiver."""

    def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
                 ice_opts, sge_opts, ipq_opts, fasta_fofn=None,
                 tmp_dir=None):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fasta
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. ccs.fofn of ccs files.

        ipq_opts --- IceQuiverHQLQOptions
                     qv_trim_5: ignore QV of n bases in the 5' end
                     qv_trim_3: ignore QV of n bases in the 3' end
                     hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                      to mark an isoform as high quality
                     hq_isoforms_fa|fq: polished, high quality consensus
                                        isoforms in fasta|q
                     lq_isoforms_fa|fq: polished, low quality consensus
                                        isoforms in fasta|q
        """
        IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                          bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn, tmp_dir=tmp_dir)
        self.nfl_fa = realpath(nfl_fa)
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        self.ipq_opts = ipq_opts

        self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

        self.icep = None   # IceAllPartials.
        self.iceq = None   # IceQuiver
        self.icepq = None  # IceQuiverPostprocess
        self._nfl_splitted_fas = None

        self.validate_inputs()

    def validate_inputs(self):
        """
        Validate input directories: root_dir, and
        files: nfl_fa, bas_fofn, ccs_fofn.
        """
        self.add_log("Validating inputs.")
        errMsg = ""
        if not op.exists(self.root_dir):
            errMsg = "Root dir {d} is not an existing directory!".\
                format(d=self.root_dir)
        if not op.exists(self.nfl_fa):
            errMsg = "Failed to find non-full-length reads {f}!".\
                format(f=self.nfl_fa)
        if self.bas_fofn is None:
            errMsg = "bas_fofn (subreadset) must be specified."
        elif not op.exists(self.bas_fofn):
            errMsg = "Failed to find bas fofn (subreadset) {f}!".format(f=self.bas_fofn)
        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise ValueError(errMsg)

    def run(self):
        """
        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samto5h, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None
        """
        if guess_file_format(self.bas_fofn) != FILE_FORMATS.BAM:
            # Create input.fasta.fofn from bas_fofn
            self.add_log("Creating fasta fofn from bas/bax.h5/bam fofn",
                         level=logging.INFO)
            if self.fasta_fofn is None:
                self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
            self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
            convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                                  out_filename=self.fasta_fofn,
                                  fasta_out_dir=self.nfl_dir)
        else:
            self.fasta_fofn = None

        # Split non-full-length reads into smaller fasta files
        # and save files to root_dir/nfl_00.fasta, ..., .
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.nfl_reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(input_fasta=self.nfl_fa,
                                            reads_per_split=self.ice_opts.nfl_reads_per_split,
                                            out_dir=self.nfl_dir,
                                            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Generating dazz DB for final.consensus.fasta
        ref_obj = DazzIDHandler(input_filename=self.final_consensus_fa, converted=False)
        ref_obj.make_db()
        msg = "Dazz DB made for: " + ref_obj.dazz_filename
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("Initializing IceAllPartials.", level=logging.INFO)

        self.icep = IceAllPartials(
            root_dir=self.root_dir,
            fasta_filenames=self._nfl_splitted_fas,
            ref_fasta=self.final_consensus_fa,
            out_pickle=self.nfl_all_pickle_fn,
            sge_opts=self.sge_opts,
            ccs_fofn=self.ccs_fofn)
        self.add_log("IceAllPartials log: {f}.".format(f=self.icep.log_fn),
                     level=logging.INFO)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        self.add_log("Initializing IceQuiver.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts,
                              tmp_dir=self.tmp_dir)
        self.add_log("IceQuiver log: {f}.".format(f=self.iceq.log_fn),
                     level=logging.INFO)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        self.add_log("Initializing IceQuiverPostprocess.", level=logging.INFO)
        self.icepq = IceQuiverPostprocess(root_dir=self.root_dir,
                                          use_sge=self.sge_opts.use_sge,
                                          quit_if_not_done=False,
                                          ipq_opts=self.ipq_opts)
        self.add_log("IceQuiverPostprocess log: {f}.".
                     format(f=self.icepq.log_fn), level=logging.INFO)
        self.icepq.run()
        self.add_log("IceQuiverPostprocess finished.", level=logging.INFO)
Example #11
def args_runner(args):
    """args runner"""
    logging.info("%s arguments are:\n%s\n", __file__, args)

    # sanity check arguments
    _sanity_check_args(args)

    # make option objects
    ice_opts = IceOptions(quiver=args.quiver,
                          use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty,
                          ece_min_len=args.ece_min_len,
                          flnc_reads_per_split=args.flnc_reads_per_split,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id,
                          use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs,
                          blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc,
                          gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name,
                          sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=args.qv_trim_5,
        qv_trim_3=args.qv_trim_3,
        hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)

    # (1) separate flnc reads into bins
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa,
                           root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb,
                           bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual,
                           max_base_limit_MB=args.max_base_limit_MB)
    s.run()

    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(
        tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)

    # (2) apply 'pbtranscript cluster' to each bin
    # run ICE/Quiver (the whole thing), providing the fasta_fofn
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")

        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!",
                            ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)

        obj = Cluster(root_dir=split_dir,
                      flnc_fa=flnc_file,
                      nfl_fa=args.nfl_fa,
                      bas_fofn=args.bas_fofn,
                      ccs_fofn=args.ccs_fofn,
                      fasta_fofn=args.fasta_fofn,
                      out_fa=cur_out_cons,
                      sge_opts=sge_opts,
                      ice_opts=ice_opts,
                      ipq_opts=ipq_opts)

        if args.mem_debug:  # DEBUG
            from memory_profiler import memory_usage
            start_t = time.time()
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(
                    split_dir, end_t - start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

        if not args.keep_tmp_files:  # by default, delete all temporary files.
            logging.info("Deleting %s", ipq_f.tmp_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir])
            logging.info("Deleting %s", ipq_f.quivered_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir])

    # (3) merge polished isoform cluster from all bins
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs,
                      ipq_opts=ipq_opts)
    c.run()
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)

    # (4) map HQ isoforms to GMAP reference genome
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq,
                          sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db,
                          gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)

    # (5) post mapping to genome analysis, including
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" %
                         out_isoforms)

    post_mapping_to_genome_runner(in_isoforms=in_isoforms,
                                  in_sam=tofu_f.sorted_gmap_sam,
                                  in_pickle=tofu_f.hq_lq_prefix_dict_pickle,
                                  out_isoforms=args.collapsed_filtered_fn,
                                  out_gff=args.gff_fn,
                                  out_abundance=args.abundance_fn,
                                  out_group=args.group_fn,
                                  out_read_stat=args.read_stat_fn,
                                  min_aln_coverage=args.min_aln_coverage,
                                  min_aln_identity=args.min_aln_identity,
                                  min_flnc_coverage=args.min_flnc_coverage,
                                  max_fuzzy_junction=args.max_fuzzy_junction,
                                  allow_extra_5exon=args.allow_extra_5exon,
                                  min_count=args.min_count)

    return 0
Example #12
    def run(self):
        """
        For each cluster bin, create summary.json, cluster_report.csv,
        hq_isoforms.fa|fq, lq_isoforms.fa|fq
        Finally, merge all cluster bins and save all outputs to 'combined'.
        """
        logging.info("Running {f} v{v}.".format(f=op.basename(__file__),
                                                v=self.getVersion()))
        args = self.args

        # Get cluster bins directories as input
        cluster_bin_dirs = self.get_cluster_bin_dirs(separate_flnc_pickle=args.separate_flnc_pickle,
                                                     cluster_bin_dirs=args.cluster_bin_dirs)
        cluster_bin_indices = range(0, len(cluster_bin_dirs))

        # Create output dir
        combined_dir = args.combined_dir
        mkdir(combined_dir)

        # Get combined output filenames
        def f(input_fn, default_fn):
            if input_fn is None:
                return op.join(combined_dir, default_fn)
            return input_fn

        out_consensus_isoforms_fa = f(args.consensus_isoforms_fa, "all.consensus_isoforms.fasta")
        out_summary = f(args.summary_fn, "all.cluster_summary.json")
        out_report = f(args.report_fn, "all.cluster_report.csv")
        out_hq_fa = f(args.hq_isoforms_fa, "all.polished_hq.fasta")
        out_lq_fa = f(args.lq_isoforms_fa, "all.polished_lq.fasta")
        out_hq_fq = f(args.hq_isoforms_fq, "all.polished_hq.fastq")
        out_lq_fq = f(args.lq_isoforms_fq, "all.polished_lq.fastq")

        ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5,
                                        qv_trim_3=args.qv_trim_3,
                                        hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)
        sample_name = get_sample_name(input_sample_name=args.sample_name)


        hq_fq_fns, lq_fq_fns = [], []
        split_uc_pickles, split_partial_uc_pickles = [], []
        split_consensus_isoforms = []

        for cluster_bin_dir in cluster_bin_dirs:
            ice_pq = IceQuiverPostprocess(root_dir=cluster_bin_dir, ipq_opts=ipq_opts)
            hq_fq_fns.append(ice_pq.quivered_good_fq)
            lq_fq_fns.append(ice_pq.quivered_bad_fq)
            split_uc_pickles.append(ice_pq.final_pickle_fn)
            split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
            split_consensus_isoforms.append(ice_pq.final_consensus_fa)

        combined_files = CombinedFiles(combined_dir)
        log.info("Combining results of all cluster bins to %s.", combined_dir)
        log.info("Merging HQ|LQ isoforms from all cluster bins.")
        log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
        log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
        combine_polished_isoforms(split_indices=cluster_bin_indices,
                                  split_hq_fns=hq_fq_fns,
                                  split_lq_fns=lq_fq_fns,
                                  combined_hq_fa=combined_files.all_hq_fa,
                                  combined_hq_fq=combined_files.all_hq_fq,
                                  combined_lq_fa=combined_files.all_lq_fa,
                                  combined_lq_fq=combined_files.all_lq_fq,
                                  hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
                                  sample_name=sample_name)

        ln(combined_files.all_hq_fa, out_hq_fa) #'HQ isoforms'
        ln(combined_files.all_hq_fq, out_hq_fq) #'HQ isoforms'
        ln(combined_files.all_lq_fa, out_lq_fa) #'LQ isoforms'
        ln(combined_files.all_lq_fq, out_lq_fq) #'LQ isoforms'

        log.info("Merging consensus isoforms from all cluster bins.")
        combine_consensus_isoforms(split_indices=cluster_bin_indices,
                                   split_files=split_consensus_isoforms,
                                   combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
                                   sample_name=sample_name)
        ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)

        log.info("Writing cluster summary to %s", combined_files.all_cluster_summary_fn)
        write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                              isoforms_fa=out_consensus_isoforms_fa,
                              hq_fa=out_hq_fa, lq_fa=out_lq_fa)
        ln(combined_files.all_cluster_summary_fn, out_summary) # "cluster summary"

        log.info("Writing cluster report to %s", combined_files.all_cluster_report_fn)
        write_combined_cluster_report(split_indices=cluster_bin_indices,
                                      split_uc_pickles=split_uc_pickles,
                                      split_partial_uc_pickles=split_partial_uc_pickles,
                                      report_fn=combined_files.all_cluster_report_fn,
                                      sample_name=sample_name)
        ln(combined_files.all_cluster_report_fn, out_report) # "cluster report"