Exemplo n.º 1
0
    def create_sams_for_clusters_in_bin(self, cids, refs):
        """Align raw subreads of each cluster in `cids` to its consensus.

        For every cluster id in cids, run blasr on the cluster's raw
        fasta ($root_dir/tmp/?/c{cid}/in.raw_with_partial.fa) against
        its consensus sequence refs[cid], writing the alignment to
        sam_of_cluster(cid).

        cids --- cluster ids
        refs --- refs[cid] -> consensus seq of cluster cid

        This function has to be called after prepare_raw_fa_for_clusters
        is done.
        """
        for cid in cids:
            # raw fasta prepared earlier for this cluster
            cluster_fa = self.raw_fa_of_cluster(cid)

            if not op.exists(cluster_fa):
                msg = ("raw_fa {f} does not exist. ".format(f=cluster_fa) +
                       "Please check raw fasta of this bin is created.")
                raise IOError(msg)

            blasr_sam_for_quiver(input_fasta=cluster_fa,
                                 ref_fasta=refs[cid],
                                 out_sam_filename=self.sam_of_cluster(cid),
                                 run_cmd=True,
                                 blasr_nproc=self.sge_opts.blasr_nproc)
Exemplo n.º 2
0
    def create_sams_for_clusters_in_bin(self, cids, refs):
        """Create a sam file for every cluster listed in cids.

        Each cluster's previously prepared raw fasta
        ($root_dir/tmp/?/c{k}/in.raw_with_partial.fa) is aligned by
        blasr against the cluster's consensus sequence refs[k], and the
        alignment is written to sam_of_cluster(k).

        cids --- cluster ids
        refs --- refs[k] -> consensus seq of cluster k

        This function has to be called after prepare_raw_fa_for_clusters
        is done.
        """
        for cluster_id in cids:
            fasta_fn = self.raw_fa_of_cluster(cluster_id)
            if op.exists(fasta_fn):
                blasr_sam_for_quiver(
                    input_fasta=fasta_fn,
                    ref_fasta=refs[cluster_id],
                    out_sam_filename=self.sam_of_cluster(cluster_id),
                    run_cmd=True,
                    blasr_nproc=self.sge_opts.blasr_nproc)
            else:
                raise IOError(
                    "raw_fa {f} does not exist. ".format(f=fasta_fn) +
                    "Please check raw fasta of this bin is created.")
Exemplo n.º 3
0
    def submit_quiver_jobs(self, d, uc, partial_uc, refs, keys, start, end,
                           submitted, todo, use_sge, max_sge_jobs,
                           quiver_nproc):
        """Call quiver to polish consensus.
        (1) for each cluster k, obtain unrolled sequences of all reads (zmws)
            belonging to this cluster, and save in raw_fa_of_cluster(k)
        (2) for each cluster k, call blasr to align raw_f_of_cluster to
            consensus sequence of the cluster and create sam_of_cluster.

        (3) Put every 100 clusters into one big bin, and then
            merge all sam_of_cluster files to sam_of_quivered_bin
        (4) Prepare commands including
                samtoh5, loadPulses, cmph5tools.py ...
            in order to convert sam_of_quivered_bin to cmph5_of_quivered_bin.
                * Either write these command to script_of_quivered_bin and qsub
                  all jobs later when scripts of all quivered bins are done,
                * Or execute the commands immediately.

        d          --- fasta dict passed to write_in_raw_fasta as
                       input_fasta_d (presumably seqid -> sequence; verify
                       against write_in_raw_fasta)
        uc         --- uc[k] -> seq ids of full-length reads of cluster k
        partial_uc --- partial_uc[k] -> seq ids of partial reads of cluster k
        refs       --- refs[k] -> consensus fasta of cluster k
        keys       --- ordered cluster ids; keys[start:end] are processed here
        start, end --- half-open index range into keys
        submitted  --- output list; (job_id_or_"local", script) tuples are
                       appended for each executed/submitted script
        todo       --- list of quiver scripts still to be run
        use_sge    --- if True (and max_sge_jobs > 0), qsub scripts to SGE;
                       otherwise run them locally with bash
        max_sge_jobs  --- max number of concurrent SGE jobs per round
        quiver_nproc  --- nproc for quiver; also SGE slot count (-pe smp)
        """
        # Process clusters in batches of 100: one quiver bin per batch.
        for i in xrange(start, end, 100):
            for k in keys[i:min(end, i + 100)]:
                #os.chdir(op.join('./tmp', str(k/10000), 'c'+str(k)))
                raw_fa = self.raw_fa_of_cluster(k)

                # Write unrolled sequences of both full-length (uc) and
                # partial (partial_uc) reads of cluster k to raw_fa.
                # write_in_raw_fa return movies of reads in partial_uc
                # logging.debug("uc[k]={0}".format(uc[k]))
                # logging.debug("partial_uc[k]={0}".format(uc[k]))
                write_in_raw_fasta(input_fasta_d=d,
                                   in_seqids=uc[k] + partial_uc[k],
                                   out_fa=raw_fa,
                                   ignore_keyerror=True)

                # Align raw reads to the cluster consensus for quiver input.
                #TODO: use multi-processing pool, reduce nproc
                blasr_sam_for_quiver(input_fasta=raw_fa,
                                     ref_fasta=refs[k],
                                     out_sam_filename=self.sam_of_cluster(k),
                                     run_cmd=True)

            # Build the quiver script for this 100-cluster bin; queue it.
            fname = self.setup_quiver_for_batch(cids=keys[i:min(end, i + 100)],
                                                refs=refs,
                                                quiver_nproc=quiver_nproc,
                                                return_script=True)
            todo.append(fname)

            if use_sge is not True or \
               max_sge_jobs == 0: # don't use SGE
                # Run every pending script locally, redirecting stdout/stderr
                # to per-job .olog/.elog files under quivered_log_dir.
                for job in todo:
                    elog = op.join(self.quivered_log_dir,
                                   op.basename(job) + ".elog")
                    olog = op.join(self.quivered_log_dir,
                                   op.basename(job) + ".olog")
                    msg = "Running quiver job locally: {j} ".format(j=job) + \
                          "1>{olog} 2>{elog}".format(olog=olog, elog=elog)
                    self.add_log(msg)
                    cmd = "bash " + job + " 1>{olog} 2>{elog}".\
                          format(olog=olog, elog=elog)
                    _out, _code, _msg = backticks(cmd)
                    if _code != 0:
                        errMsg = "Failed to run quiver {j}".format(
                            j=job) + _msg
                        self.add_log(errMsg, level=logging.ERROR)
                        raise RuntimeError(errMsg)
                    submitted.append(("local", job))
                # NOTE(review): rebinds the local name only — the caller's
                # `todo` list is NOT cleared (unlike todo.remove below, which
                # mutates it). Confirm whether callers rely on this.
                todo = []
            else:
                # Submit pending scripts to SGE, at most max_sge_jobs at a
                # time, until the queue is drained.
                while len(todo) > 0:
                    n = min(max_sge_jobs, len(todo))
                    for job in todo[:n]:
                        # ex: Your job 8613116 ("c20to70.sh") has been submitted
                        elog = op.join(self.quivered_log_dir,
                                       op.basename(job) + ".elog")
                        olog = op.join(self.quivered_log_dir,
                                       op.basename(job) + ".olog")
                        qsub_cmd = "qsub " + \
                                   "-pe smp {n} ".format(n=quiver_nproc) + \
                                   "-cwd -S /bin/bash -V " + \
                                   "-e {elog} ".format(elog=elog) + \
                                   "-o {olog} ".format(olog=olog) + \
                                   "{job}".format(job=job)
                        msg = "Submitting CMD: {cmd}.\n".format(cmd=qsub_cmd)
                        self.add_log(msg)
                        _out, _code, _msg = backticks(qsub_cmd)
                        if _code != 0:
                            errMsg = "Failed to submit CMD {cmd}.".format(
                                cmd=qsub_cmd)
                            self.add_log(errMsg, level=logging.ERROR)
                            raise RuntimeError(errMsg)

                        # Third token of qsub's output ("Your job <id> ...")
                        # is the SGE job id.
                        job_id = str(_out).split()[2]
                        submitted.append((job_id, job))
                        todo.remove(job)
Exemplo n.º 4
0
    def submit_quiver_jobs(self, d, uc, partial_uc, refs, keys, start, end,
                           submitted, todo,
                           use_sge, max_sge_jobs, quiver_nproc):
        """Call quiver to polish consensus.
        (1) for each cluster k, obtain unrolled sequences of all reads (zmws)
            belonging to this cluster, and save in raw_fa_of_cluster(k)
        (2) for each cluster k, call blasr to align raw_f_of_cluster to
            consensus sequence of the cluster and create sam_of_cluster.

        (3) Put every 100 clusters into one big bin, and then
            merge all sam_of_cluster files to sam_of_quivered_bin
        (4) Prepare commands including
                samtoh5, loadPulses, cmph5tools.py ...
            in order to convert sam_of_quivered_bin to cmph5_of_quivered_bin.
                * Either write these command to script_of_quivered_bin and qsub
                  all jobs later when scripts of all quivered bins are done,
                * Or execute the commands immediately.

        d          --- fasta dict passed to write_in_raw_fasta as
                       input_fasta_d (presumably seqid -> sequence; verify
                       against write_in_raw_fasta)
        uc         --- uc[k] -> seq ids of full-length reads of cluster k
        partial_uc --- partial_uc[k] -> seq ids of partial reads of cluster k
        refs       --- refs[k] -> consensus fasta of cluster k
        keys       --- ordered cluster ids; keys[start:end] are processed here
        start, end --- half-open index range into keys
        submitted  --- output list; (job_id_or_"local", script) tuples are
                       appended for each executed/submitted script
        todo       --- list of quiver scripts still to be run
        use_sge    --- if True (and max_sge_jobs > 0), qsub scripts to SGE;
                       otherwise run them locally with bash
        max_sge_jobs  --- max number of concurrent SGE jobs per round
        quiver_nproc  --- nproc for quiver; also SGE slot count (-pe smp)
        """
        # Process clusters in batches of 100: one quiver bin per batch.
        for i in xrange(start, end, 100):
            for k in keys[i: min(end, i+100)]:
                #os.chdir(op.join('./tmp', str(k/10000), 'c'+str(k)))
                raw_fa = self.raw_fa_of_cluster(k)

                # Write unrolled sequences of both full-length (uc) and
                # partial (partial_uc) reads of cluster k to raw_fa.
                # write_in_raw_fa return movies of reads in partial_uc
                # logging.debug("uc[k]={0}".format(uc[k]))
                # logging.debug("partial_uc[k]={0}".format(uc[k]))
                write_in_raw_fasta(input_fasta_d=d,
                    in_seqids=uc[k] + partial_uc[k],
                    out_fa=raw_fa,
                    ignore_keyerror=True)

                # Align raw reads to the cluster consensus for quiver input.
                #TODO: use multi-processing pool, reduce nproc
                blasr_sam_for_quiver(
                    input_fasta=raw_fa,
                    ref_fasta=refs[k],
                    out_sam_filename=self.sam_of_cluster(k),
                    run_cmd=True)

            # Build the quiver script for this 100-cluster bin; queue it.
            fname = self.setup_quiver_for_batch(cids=keys[i: min(end, i+100)],
                        refs=refs, quiver_nproc=quiver_nproc,
                        return_script=True)
            todo.append(fname)

            if use_sge is not True or \
               max_sge_jobs == 0: # don't use SGE
                # Run every pending script locally, redirecting stdout/stderr
                # to per-job .olog/.elog files under quivered_log_dir.
                for job in todo:
                    elog = op.join(self.quivered_log_dir,
                                   op.basename(job) + ".elog")
                    olog = op.join(self.quivered_log_dir,
                                   op.basename(job) + ".olog")
                    msg = "Running quiver job locally: {j} ".format(j=job) + \
                          "1>{olog} 2>{elog}".format(olog=olog, elog=elog)
                    self.add_log(msg)
                    cmd = "bash " + job + " 1>{olog} 2>{elog}".\
                          format(olog=olog, elog=elog)
                    _out, _code, _msg = backticks(cmd)
                    if _code != 0:
                        errMsg = "Failed to run quiver {j}".format(j=job) + _msg
                        self.add_log(errMsg, level=logging.ERROR)
                        raise RuntimeError(errMsg)
                    submitted.append(("local", job))
                # NOTE(review): rebinds the local name only — the caller's
                # `todo` list is NOT cleared (unlike todo.remove below, which
                # mutates it). Confirm whether callers rely on this.
                todo = []
            else:
                # Submit pending scripts to SGE, at most max_sge_jobs at a
                # time, until the queue is drained.
                while len(todo) > 0:
                    n = min(max_sge_jobs, len(todo))
                    for job in todo[:n]:
                        # ex: Your job 8613116 ("c20to70.sh") has been submitted
                        elog = op.join(self.quivered_log_dir,
                                       op.basename(job) + ".elog")
                        olog = op.join(self.quivered_log_dir,
                                       op.basename(job) + ".olog")
                        qsub_cmd = "qsub " + \
                                   "-pe smp {n} ".format(n=quiver_nproc) + \
                                   "-cwd -S /bin/bash -V " + \
                                   "-e {elog} ".format(elog=elog) + \
                                   "-o {olog} ".format(olog=olog) + \
                                   "{job}".format(job=job)
                        msg = "Submitting CMD: {cmd}.\n".format(cmd=qsub_cmd)
                        self.add_log(msg)
                        _out, _code, _msg = backticks(qsub_cmd)
                        if _code != 0:
                            errMsg = "Failed to submit CMD {cmd}.".format(
                                    cmd=qsub_cmd)
                            self.add_log(errMsg, level=logging.ERROR)
                            raise RuntimeError(errMsg)

                        # Third token of qsub's output ("Your job <id> ...")
                        # is the SGE job id.
                        job_id = str(_out).split()[2]
                        submitted.append((job_id, job))
                        todo.remove(job)