示例#1
0
    def _validate_inputs(self, root_dir, N):
        """
        Check that the DONE file and the pickle of every chunk exist.

        root_dir --- root directory handed to IceFiles
        N --- number of chunks; chunks 0 ... N-1 are checked

        Return (splitted_pickles, out_pickle), where splitted_pickles is the
        list of per-chunk pickle paths and out_pickle is
        IceFiles.nfl_all_pickle_fn.

        Raise ValueError listing every missing DONE/pickle file (one message
        per line), not just the last one found.
        """
        icef = IceFiles(prog_name="ice_partial_merge",
                        root_dir=root_dir, no_log_f=False)

        # root_dir/output/map_noFL/input.split_{0:03d}.fasta.partial_uc.pickle
        splitted_pickles = [icef.nfl_pickle_i(i) for i in range(0, N)]
        dones = [icef.nfl_done_i(i) for i in range(0, N)]

        # Collect one message per missing input so that the user sees all
        # problems at once instead of only the last one.
        problems = ["DONE file {f} does not exist.".format(f=done)
                    for done in dones if not nfs_exists(done)]
        problems.extend("Pickle file {f} does not exist.".format(f=fn)
                        for fn in splitted_pickles if not nfs_exists(fn))

        if problems:
            raise ValueError("\n".join(problems))

        # root_dir/output/map_noFL/nfl.all.partial_uc.pickle
        out_pickle = icef.nfl_all_pickle_fn
        return (splitted_pickles, out_pickle)
示例#2
0
    def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc, tmp_dir):
        """
        Validate the inputs of the i-th chunk of nfl reads and save the
        $ICE_PARTIAL_PY i command to the chunk's script file.

        Return (input_fasta, ref_fasta, out_pickle, done_file).
        Raise ValueError if the chunk fasta, the consensus fasta or the
        dazz db does not exist (checked in that order).
        """
        ice_files = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                             root_dir=root_dir,
                             no_log_f=False)

        # Reference: root_dir/output/final.consensus.fasta and its dazz db.
        ref_fasta = ice_files.final_consensus_fa
        ref_dazz = ice_files.final_dazz_db

        # Per-chunk files, all derived from
        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        input_fasta = ice_files.nfl_fa_i(i)       # the chunk itself
        out_pickle = ice_files.nfl_pickle_i(i)    # $input_fasta.partial_uc.pickle
        done_file = ice_files.nfl_done_i(i)       # ....partial_uc.pickle.DONE
        script_file = ice_files.nfl_script_i(i)   # $input_fasta.partial_uc.sh

        # Fail on the first missing input.
        if not nfs_exists(input_fasta):
            raise ValueError(
                "The {i}-th splitted non-full-length reads "
                "fasta file {f} does not exist. "
                "Please run $ICE_PARTIAL_PY split first."
                .format(i=i, f=input_fasta))
        if not nfs_exists(ref_fasta):
            raise ValueError(
                "The unpolished consensus isoforms fasta file "
                "{f} does not exist. "
                "Please make sure ICE is successfully done."
                .format(f=ref_fasta))
        if not nfs_exists(ref_dazz):
            raise ValueError(
                "The dazz db {f} does not exist. "
                "Please make sure it is already built."
                .format(f=ref_dazz))

        # Persist the command line for this chunk.
        command = self._cmd_str(root_dir=root_dir,
                                i=[i],
                                ccs_fofn=ccs_fofn,
                                blasr_nproc=blasr_nproc,
                                tmp_dir=tmp_dir)
        with open(script_file, 'w') as script:
            script.write(command + "\n")

        ice_files.add_log(
            "Writing CMD to: {script_file}".format(script_file=script_file))
        ice_files.close_log()

        return (input_fasta, ref_fasta, out_pickle, done_file)
示例#3
0
    def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc, tmp_dir):
        """
        Check the inputs needed to process the i-th chunk of nfl reads,
        write the $ICE_PARTIAL_PY i command to the chunk's script file and
        return (input_fasta, ref_fasta, out_pickle, done_file).

        Raise ValueError for the first required input that is missing.
        """
        icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                        root_dir=root_dir, no_log_f=False)

        # root_dir/output/final.consensus.fasta
        ref_fasta = icef.final_consensus_fa
        ref_dazz = icef.final_dazz_db
        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        input_fasta = icef.nfl_fa_i(i)
        # $input_fasta.partial_uc.pickle
        out_pickle = icef.nfl_pickle_i(i)
        # $input_fasta.partial_uc.pickle.DONE
        done_file = icef.nfl_done_i(i)
        # $input_fasta.partial_uc.sh
        script_file = icef.nfl_script_i(i)

        # (required path, complaint if it is missing), in checking order.
        required = (
            (input_fasta,
             "The {i}-th splitted non-full-length reads "
             "fasta file {f} does not exist. "
             "Please run $ICE_PARTIAL_PY split first."
             .format(i=i, f=input_fasta)),
            (ref_fasta,
             "The unpolished consensus isoforms fasta file "
             "{f} does not exist. "
             "Please make sure ICE is successfully done."
             .format(f=ref_fasta)),
            (ref_dazz,
             "The dazz db {f} does not exist. "
             "Please make sure it is already built."
             .format(f=ref_dazz)),
        )
        for path, complaint in required:
            if not nfs_exists(path):
                raise ValueError(complaint)

        # Save cmd to script_file.
        cmd_line = self._cmd_str(root_dir=root_dir, i=[i],
                                 ccs_fofn=ccs_fofn,
                                 blasr_nproc=blasr_nproc,
                                 tmp_dir=tmp_dir)
        with open(script_file, 'w') as out:
            out.write(cmd_line + "\n")

        log_line = "Writing CMD to: {script_file}".format(
            script_file=script_file)
        icef.add_log(log_line)
        icef.close_log()

        return (input_fasta, ref_fasta, out_pickle, done_file)
示例#4
0
    def create_quiver_bins_and_submit_jobs(self, d, uc, partial_uc, refs, keys,
                                           start, end, submitted, sge_opts):
        """
        Group clusters keys[start:end] into bins of 100, create one bash
        script per bin and submit each script right after it is created
        (via submit_todo_quiver_jobs).

        Return the list of created bash scripts; an empty list if
        [start, end) is not a valid, non-empty range within keys.
        """
        num_keys = len(keys)
        range_ok = (0 <= start < end and
                    start <= num_keys and end <= num_keys)
        if not range_ok:
            return []

        # Point the ref of every cluster in range at its cluster dir.
        bin_refs = {}
        for cid in keys[start:end]:
            bin_refs[cid] = op.join(self.cluster_dir(cid),
                                    op.basename(refs[cid]))

        # Only the first ref is probed; if it is missing, refs of the whole
        # range are reconstructed.
        if not nfs_exists(bin_refs[keys[start]]):
            self.reconstruct_ref_fa_for_clusters_in_bin(cids=keys[start:end],
                                                        refs=bin_refs)

        bin_size = 100
        bin_scripts = []
        for lo in xrange(start, end, bin_size):
            hi = min(end, lo + bin_size)
            script = self.create_a_quiver_bin(cids=keys[lo:hi], d=d, uc=uc,
                                              partial_uc=partial_uc,
                                              refs=bin_refs,
                                              sge_opts=sge_opts)
            bin_scripts.append(script)
            # Submit the script of this bin immediately.
            self.submit_todo_quiver_jobs(todo=[script], submitted=submitted,
                                         sge_opts=sge_opts)
        return bin_scripts
示例#5
0
    def run(self):
        """
        Concatenate the per-chunk logs of submitted quiver jobs into the
        overall submitted-quiver-jobs log.

        Raise IOError if the log of any chunk does not exist.
        """
        iceq = IceQuiver(root_dir=self.root_dir, bas_fofn=None,
                         fasta_fofn=None, sge_opts=None,
                         prog_name="ice_quiver_merge")

        iceq.add_log(self.cmd_str())
        iceq.add_log("root_dir: {d}.".format(d=self.root_dir))
        iceq.add_log("Total number of chunks: N = {N}.".format(N=self.N))

        # One log per chunk, chunk 0 first.
        chunk_logs = []
        for chunk_idx in range(0, self.N):
            chunk_logs.append(iceq.submitted_quiver_jobs_log_of_chunk_i(
                i=chunk_idx, num_chunks=self.N))

        for chunk_log in chunk_logs:
            if nfs_exists(chunk_log):
                continue
            raise IOError("Log {f} of submitted quiver jobs does not exist."
                          .format(f=chunk_log))

        merged_log = iceq.submitted_quiver_jobs_log
        announcement = (
            "Collecting submitted quiver jobs from:\n{src}\nto {dst}."
            .format(src="\n".join(chunk_logs), dst=merged_log))
        iceq.add_log(announcement)

        cat_files(src=chunk_logs, dst=merged_log)

        iceq.close_log()
示例#6
0
    def run(self):
        """
        Merge step: collect the submitted-quiver-jobs log of each of the
        self.N chunks and cat them into iceq.submitted_quiver_jobs_log.

        Raise IOError as soon as a chunk log is found missing.
        """
        iceq = IceQuiver(root_dir=self.root_dir,
                         bas_fofn=None,
                         fasta_fofn=None,
                         sge_opts=None,
                         prog_name="ice_quiver_merge")

        iceq.add_log(self.cmd_str())
        iceq.add_log("root_dir: {d}.".format(d=self.root_dir))
        iceq.add_log("Total number of chunks: N = {N}.".format(N=self.N))

        sources = [
            iceq.submitted_quiver_jobs_log_of_chunk_i(i=idx,
                                                      num_chunks=self.N)
            for idx in range(0, self.N)
        ]
        for source in sources:
            if not nfs_exists(source):
                message = ("Log {f} ".format(f=source) +
                           "of submitted quiver jobs does not exist.")
                raise IOError(message)

        destination = iceq.submitted_quiver_jobs_log

        joined_sources = "\n".join(sources)
        iceq.add_log(
            "Collecting submitted quiver jobs from:\n{src}\nto {dst}.".format(
                src=joined_sources, dst=destination))

        cat_files(src=sources, dst=destination)

        iceq.close_log()
示例#7
0
    def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
        """
        Reconstruct the ref_fa of every cluster in cids from the final
        consensus FASTA, writing each one into the cluster's directory
        e.g.,
            self.g_consensus_ref_fa_of_cluster(cid)

        cids --- list[int(cid)], e.g., [10, 11, 12, ..., 20]
        refs --- dict{int(cid): ref_fa of cluster(cid)}; updated in place so
                 that refs[cid] points at the file that was written.

        Raise IOError if the final consensus FASTA does not exist.
        """
        # Check existence when first time it is read.
        if not nfs_exists(self.final_consensus_fa):
            raise IOError("Final consensus FASTA file {f} does not exist.".
                          format(f=self.final_consensus_fa))

        self.add_log("Reconstructing g consensus files for clusters "
                     "[%d, %d] in %s" % (cids[0], cids[-1], self.tmp_dir),
                     level=logging.INFO)

        # Membership is tested once per consensus record, so use a set
        # rather than scanning the list each time.
        wanted_cids = set(cids)

        final_consensus_d = FastaRandomReader(self.final_consensus_fa)
        for ref_id in final_consensus_d.d.keys():
            # e.g., ref_id = c103/1/3708, cid = 103,
            #       refs[cid] = ...tmp/0/c103/g_consensus_ref.fasta
            cid = int(ref_id.split('/')[0].replace('c', ''))
            if cid not in wanted_cids:
                continue

            cluster_dir = self.cluster_dir(cid)
            mkdir(cluster_dir)
            ref_fa = op.join(cluster_dir, op.basename(refs[cid]))
            refs[cid] = ref_fa
            with FastaWriter(ref_fa) as writer:
                self.add_log("Writing ref_fa %s" % refs[cid])
                writer.writeRecord(ref_id,
                                   final_consensus_d[ref_id].sequence[:])

        self.add_log("Reconstruct of g consensus files completed.",
                     level=logging.INFO)
示例#8
0
    def create_quiver_bins_and_submit_jobs(self, d, uc, partial_uc, refs, keys,
                                           start, end, submitted, sge_opts):
        """
        Bin clusters keys[start:end], 100 clusters per bin. For each bin a
        bash script is created (create_a_quiver_bin) and then handed to
        submit_todo_quiver_jobs before the next bin is processed.

        Return all created bash scripts in a list ([] when start/end do not
        describe a non-empty range inside keys).
        """
        total = len(keys)
        if not (start < end and start >= 0 and
                start <= total and end <= total):
            return []

        # Update refs: every cluster in range gets a ref under its own
        # cluster dir, keeping the original base name.
        selected = keys[start:end]
        relocated = dict(
            (cid, op.join(self.cluster_dir(cid), op.basename(refs[cid])))
            for cid in selected)

        # Reconstruct refs if the first one does not exist.
        first_ref = relocated[keys[start]]
        if not nfs_exists(first_ref):
            self.reconstruct_ref_fa_for_clusters_in_bin(cids=keys[start:end],
                                                        refs=relocated)

        scripts = []
        for bin_start in xrange(start, end, 100):
            bin_end = min(end, bin_start + 100)
            bin_cids = keys[bin_start:bin_end]
            bin_script = self.create_a_quiver_bin(cids=bin_cids, d=d, uc=uc,
                                                  partial_uc=partial_uc,
                                                  refs=relocated,
                                                  sge_opts=sge_opts)
            scripts.append(bin_script)
            self.submit_todo_quiver_jobs(todo=[bin_script],
                                         submitted=submitted,
                                         sge_opts=sge_opts)
        return scripts
示例#9
0
    def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
        """
        Reconstruct the ref_fa of every cluster in cids from the final
        consensus FASTA, writing each one into the cluster's directory
        e.g.,
            self.g_consensus_ref_fa_of_cluster(cid)

        cids --- list[int(cid)], e.g., [10, 11, 12, ..., 20]
        refs --- dict{int(cid): ref_fa of cluster(cid)}; updated in place so
                 that refs[cid] points at the file that was written.

        Raise IOError if the final consensus FASTA does not exist.
        """
        # Check existence when first time it is read.
        if not nfs_exists(self.final_consensus_fa):
            raise IOError("Final consensus FASTA file {f} does not exist.".
                          format(f=self.final_consensus_fa))

        self.add_log("Reconstructing g consensus files for clusters "
                     "[%d, %d] in %s" % (cids[0], cids[-1], self.tmp_dir),
                     level=logging.INFO)

        # Membership is tested once per consensus record, so use a set
        # rather than scanning the list each time.
        wanted_cids = set(cids)

        final_consensus_d = FastaRandomReader(self.final_consensus_fa)
        for ref_id in final_consensus_d.d.keys():
            # e.g., ref_id = c103/1/3708, cid = 103,
            #       refs[cid] = ...tmp/0/c103/g_consensus_ref.fasta
            cid = int(ref_id.split('/')[0].replace('c', ''))
            if cid not in wanted_cids:
                continue

            cluster_dir = self.cluster_dir(cid)
            mkdir(cluster_dir)
            ref_fa = op.join(cluster_dir, op.basename(refs[cid]))
            refs[cid] = ref_fa
            with FastaWriter(ref_fa) as writer:
                self.add_log("Writing ref_fa %s" % refs[cid])
                writer.writeRecord(ref_id,
                                   final_consensus_d[ref_id].sequence[:])

        self.add_log("Reconstruct of g consensus files completed.",
                     level=logging.INFO)
示例#10
0
    def _validate_inputs(self, root_dir, nfl_fa, N):
        """
        Check inputs, return
        (num_reads,
         number_reads_per_chunk,
         nfl_dir,
         [i-th_chunk_nfl_fa for i in [0...N-1]])

        root_dir --- root directory handed to IceFiles
        nfl_fa --- input non-full-length reads fasta file
        N --- number of chunks to split nfl_fa into, must be positive

        Raise ValueError if N is not positive or nfl_fa does not exist.
        """
        # N is used as a divisor below; reject it before any directory is
        # created instead of failing later with ZeroDivisionError (N == 0)
        # or silently producing no chunks (N < 0).
        if N <= 0:
            raise ValueError(
                "Input file can not be splitted into %d chunks!" % N)

        icef = IceFiles(prog_name="ice_partial_split",
                        root_dir=root_dir,
                        no_log_f=False)

        nfl_dir = icef.nfl_dir

        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

        mkdir(icef.nfl_dir)

        # Check if inputs exist.
        if not nfs_exists(nfl_fa):
            raise ValueError("The input non-full-length reads fasta file " +
                             "{f} does not exists. ".format(f=nfl_fa))

        num_reads = num_reads_in_fasta(nfl_fa)
        # At least one read per chunk, even when num_reads < N.
        reads_per_split = int(max(1, ceil(num_reads * 1.0 / N)))

        return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
示例#11
0
    def __init__(self, input_filename, converted=False, dazz_dir=None):
        """
        input_filename - input FASTA/FASTQ/ContigSet file
        converted - True if the input file is claimed to have been
                    converted to a daligner compatible FASTA file already;
                    the claim is ignored (with a warning) when the db file
                    does not exist.
        dazz_dir - if None, save all dazz.fasta, dazz.pickle, db files
                   in the same directory as inputfile.
                   if a valid path, save all output files to dazz_dir.
        """
        self.dazz_dir = dazz_dir
        self.input_filename = realpath(input_filename)
        self.validate_file_type(self.input_filename)

        # index --> original sequence ID ex: 1 --> movie/zmw/start_end_CCS
        self.dazz_mapping = {}

        db_missing = converted and not nfs_exists(self.db_filename)
        if db_missing:
            log.warning(
                str(self.input_filename) +
                " should have been converted to daligner-compatible"
                " format, but in fact it is not. Converting ...")

        if converted and not db_missing:
            # Trust the earlier conversion and just load its mapping.
            self.read_dazz_pickle()
        else:
            self.convert_to_dazz_fasta()
            self.make_db()
示例#12
0
    def _validate_inputs(self, root_dir, nfl_fa, N):
        """
        Validate the inputs for splitting nfl_fa into N chunks and return
        (num_reads,
         number_reads_per_chunk,
         nfl_dir,
         [i-th_chunk_nfl_fa for i in [0...N-1]])

        Raise ValueError if nfl_fa does not exist, or if N is not within
        1 ... 100. When both are wrong the missing file is reported.
        """
        ice_files = IceFiles(prog_name="ice_partial_split",
                             root_dir=root_dir, no_log_f=False)

        nfl_dir = ice_files.nfl_dir

        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        splitted_nfl_fas = []
        for chunk_idx in range(0, N):
            splitted_nfl_fas.append(ice_files.nfl_fa_i(chunk_idx))

        mkdir(ice_files.nfl_dir)

        # Check the number of chunks first, but let a missing input file
        # take precedence in the reported error.
        chunk_complaint = ""
        if N <= 0 or N > 100:
            chunk_complaint = (
                "Input file can not be splitted into %d chunks!" % N)

        if not nfs_exists(nfl_fa):
            raise ValueError("The input non-full-length reads fasta file "
                             "{f} does not exists. ".format(f=nfl_fa))
        if chunk_complaint:
            raise ValueError(chunk_complaint)

        num_reads = num_reads_in_fasta(nfl_fa)
        exact_share = num_reads * 1.0 / N
        reads_per_split = int(max(1, ceil(exact_share)))

        return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
示例#13
0
    def __init__(self, input_filename, converted=False, dazz_dir=None):
        """
        input_filename - input FASTA/FASTQ/ContigSet file
        converted - whether or not input file has been converted to
                    daligner compatible FASTA file. If it is set but the
                    db file is missing, a warning is logged and the input
                    is converted anyway.
        dazz_dir - if None, save all dazz.fasta, dazz.pickle, db files
                   in the same directory as inputfile.
                   if a valid path, save all output files to dazz_dir.
        """
        self.dazz_dir = dazz_dir
        self.input_filename = realpath(input_filename)
        self.validate_file_type(self.input_filename)

        # index --> original sequence ID ex: 1 --> movie/zmw/start_end_CCS
        self.dazz_mapping = {}

        if converted:
            if nfs_exists(self.db_filename):
                # Already converted: only the mapping has to be loaded.
                self.read_dazz_pickle()
                return
            log.warning(str(self.input_filename) +
                        " should have been converted to daligner-compatible" +
                        " format, but in fact it is not. Converting ...")

        self.convert_to_dazz_fasta()
        self.make_db()
    def validate_inputs(self):
        """
        Validate if logs and pickle for non-full-length reads exist.

        Checked in order: the pickle of all non-full-length reads, the
        pickle of full-length non-chimeric reads, and the log of all
        submitted quiver jobs. The first missing file is logged at ERROR
        level and reported through IOError.
        """
        errMsg = ""

        if not nfs_exists(self.nfl_all_pickle_fn):
            errMsg = ("Pickle file {f} "
                      "which assigns all non-full-length reads to isoforms "
                      "does not exist. Please check 'ice_partial.py *' are "
                      "all done.").format(f=self.nfl_all_pickle_fn)
        elif not nfs_exists(self.final_pickle_fn):
            errMsg = ("Pickle file {f} "
                      "which assigns full-length non-chimeric reads to "
                      "isoforms does not exist."
                      ).format(f=self.final_pickle_fn)
        elif not nfs_exists(self.submitted_quiver_jobs_log):
            # The file name appears exactly once; a second, unformatted
            # "{f}" placeholder used to leak into this message.
            errMsg = ("Log file {f}"
                      " of all submitted quiver jobs does not exist."
                      ).format(f=self.submitted_quiver_jobs_log)

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
示例#15
0
    def validate_inputs(self):
        """
        Make sure the pickles and the arrow submission run file exist.

        The first missing file (nfl pickle, then final pickle, then arrow
        submission run file) is logged at ERROR level and raised as IOError.
        """
        if not nfs_exists(self.nfl_all_pickle_fn):
            problem = ("Pickle file {f} "
                       "which assigns all non-full-length reads to isoforms "
                       "does not exist. Please check 'ice_partial.py *' are "
                       "all done.").format(f=self.nfl_all_pickle_fn)
        elif not nfs_exists(self.final_pickle_fn):
            problem = ("Pickle file {f} "
                       "which assigns full-length non-chimeric reads to "
                       "isoforms does not exist."
                       ).format(f=self.final_pickle_fn)
        elif not nfs_exists(self.arrow_submission_run_file):
            problem = ("Log file {f}"
                       " of all submitted arrow jobs does not exist."
                       ).format(f=self.arrow_submission_run_file)
        else:
            # Everything is in place.
            return

        self.add_log(problem, level=logging.ERROR)
        raise IOError(problem)
示例#16
0
    def validate_inputs(self):
        """
        Create arrowed_dir and arrowed_log_dir, then validate log_dir, the
        subreads XML and the two pickles.

        The first failed check is logged at ERROR level and raised as
        IOError.
        """
        self.add_log("Validating inputs.")

        # Multiple ice_arrow_i jobs may run at the same time and race on
        # mkdir, so an OSError here is deliberately ignored.
        try:
            mkdir(self.arrowed_dir)
            mkdir(self.arrowed_log_dir)
        except OSError:
            pass

        if not (nfs_exists(self.log_dir) and op.isdir(self.log_dir)):
            problem = "Log dir {l} is not an existing directory.".format(
                l=self.log_dir)
        elif self.subread_xml is None:
            problem = ("Please specify subreads XML "
                       "(e.g., --subread_xml=<movie>.subreadset.xml).")
        elif not nfs_exists(self.subread_xml):
            problem = ("Specified subreads file (subread_xml={f}) "
                       "does not exist.").format(f=self.subread_xml)
        elif guess_file_format(self.subread_xml) is not FILE_FORMATS.BAM:
            problem = "Invalid subreads XML file: {0}!".format(
                self.subread_xml)
        elif not nfs_exists(self.nfl_all_pickle_fn):
            # "output/map_noFL/noFL.ALL.partial_uc.pickle"
            problem = ("Pickle file {f} "
                       "which assigns all non-full-length reads to isoforms "
                       "does not exist. Please check 'run_IcePartials2.py *' "
                       "are all done.").format(f=self.nfl_all_pickle_fn)
        elif not nfs_exists(self.final_pickle_fn):
            problem = ("Pickle file {f} "
                       "which assigns full-length non-chimeric reads to "
                       "isoforms does not exist."
                       ).format(f=self.final_pickle_fn)
        else:
            # All checks passed.
            return

        self.add_log(problem, level=logging.ERROR)
        raise IOError(problem)
示例#17
0
    def validate_inputs(self):
        """Validate input fofns, and root_dir, log_dir, tmp_dir,
        create quivered_dir and quivered_log_dir.

        Checks run in order: log_dir, bas_fofn, the nfl pickle, the final
        pickle and, only when bas_fofn is not BAM, fasta_fofn and every
        file listed in it. The first failure is logged at ERROR level and
        raised as IOError.
        """
        self.add_log("Validating inputs.")

        # Create directories: root_dir/quivered and root_dir/log_dir/quivered
        try:
            mkdir(self.quivered_dir)
            mkdir(self.quivered_log_dir)
        except OSError:
            # Multiple ice_quiver_i jobs may run at the same time and try to
            # mkdir, race condition may happen, so ignore OSError here.
            pass

        errMsg = ""

        if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
            errMsg = "Log dir {l} is not an existing directory.".\
                format(l=self.log_dir)
        elif self.bas_fofn is None:
            errMsg = "Please specify subreads file (e.g., --bas_fofn=input.fofn|subreadset.xml)."
        elif not nfs_exists(self.bas_fofn):
            errMsg = "Specified subreads file (bas_fofn={f}) does not exist.".format(
                f=self.bas_fofn)
        elif not nfs_exists(self.nfl_all_pickle_fn):
            #"output/map_noFL/noFL.ALL.partial_uc.pickle"):
            errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                     "which assigns all non-full-length reads to isoforms " + \
                     "does not exist. Please check 'ice_partial.py *' are " + \
                     "all done."
        elif not nfs_exists(self.final_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                     "which assigns full-length non-chimeric reads to " + \
                     "isoforms does not exist."

        # fasta_fofn is only needed when the subreads are not BAM (no need
        # to convert subreads.bam to fasta). It is inspected only after all
        # checks above passed, so that an earlier error is not overwritten
        # and guess_file_format never runs on a missing bas_fofn.
        if errMsg == "" and \
                guess_file_format(self.bas_fofn) is not FILE_FORMATS.BAM:
            if self.fasta_fofn is None:
                errMsg = "Please make sure ice_make_fasta_fofn has " + \
                         "been called, and specify fasta_fofn."
            elif not nfs_exists(self.fasta_fofn):
                errMsg = "Input fasta_fofn {f} does not exists.".\
                         format(f=self.fasta_fofn)
            else:
                # fasta_fofn exists: every file it lists must exist too.
                for fasta_file in get_files_from_file_or_fofn(self.fasta_fofn):
                    if not nfs_exists(fasta_file):
                        errMsg = "A file {f} in fasta_fofn does not exist.".\
                                 format(f=fasta_file)
                        break

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
示例#18
0
    def validate_inputs(self):
        """Validate input fofns, and root_dir, log_dir, tmp_dir,
        create quivered_dir and quivered_log_dir.

        Checks run in order: log_dir, bas_fofn, the nfl pickle, the final
        pickle and, only when bas_fofn is not BAM, fasta_fofn and every
        file listed in it. The first failure is logged at ERROR level and
        raised as IOError.
        """
        self.add_log("Validating inputs.")

        # Create directories: root_dir/quivered and root_dir/log_dir/quivered
        try:
            mkdir(self.quivered_dir)
            mkdir(self.quivered_log_dir)
        except OSError:
            # Multiple ice_quiver_i jobs may run at the same time and try to
            # mkdir, race condition may happen, so ignore OSError here.
            pass

        errMsg = ""

        if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
            errMsg = "Log dir {l} is not an existing directory.".\
                format(l=self.log_dir)
        elif self.bas_fofn is None:
            errMsg = "Please specify subreads file (e.g., --bas_fofn=input.fofn|subreadset.xml)."
        elif not nfs_exists(self.bas_fofn):
            errMsg = "Specified subreads file (bas_fofn={f}) does not exist.".format(f=self.bas_fofn)
        elif not nfs_exists(self.nfl_all_pickle_fn):
            #"output/map_noFL/noFL.ALL.partial_uc.pickle"):
            errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                     "which assigns all non-full-length reads to isoforms " + \
                     "does not exist. Please check 'ice_partial.py *' are " + \
                     "all done."
        elif not nfs_exists(self.final_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                     "which assigns full-length non-chimeric reads to " + \
                     "isoforms does not exist."

        # fasta_fofn is only needed when the subreads are not BAM (no need
        # to convert subreads.bam to fasta). It is inspected only after all
        # checks above passed, so that an earlier error is not overwritten
        # and guess_file_format never runs on a missing bas_fofn.
        if errMsg == "" and \
                guess_file_format(self.bas_fofn) is not FILE_FORMATS.BAM:
            if self.fasta_fofn is None:
                errMsg = "Please make sure ice_make_fasta_fofn has " + \
                         "been called, and specify fasta_fofn."
            elif not nfs_exists(self.fasta_fofn):
                errMsg = "Input fasta_fofn {f} does not exists.".\
                         format(f=self.fasta_fofn)
            else:
                # fasta_fofn exists: every file it lists must exist too.
                for fasta_file in get_files_from_file_or_fofn(self.fasta_fofn):
                    if not nfs_exists(fasta_file):
                        errMsg = "A file {f} in fasta_fofn does not exist.".\
                                 format(f=fasta_file)
                        break

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
示例#19
0
    def create_arrows_bins_no_submit(self, d, uc, partial_uc, refs, cids_todo):
        """
        Create arrow bins for cids in <cids_todo>. Handle missing references, etc.
        Create/Write the jobs but DO NOT submit.

        refs --- dict{cid: ref_fa of cluster(cid)}; entries whose file is
                 missing are reconstructed before the bin is created.

        Return whatever create_a_arrow_bin returns for cids_todo.
        """
        # Liz: refs are deliberately NOT re-pointed at cluster dirs here,
        # because the "refs" from the pickle should be accurate, plus the
        # new cids after ice2 collection is b<bin>_c<cid>.

        # Reconstruct refs if not exist. A real list is required: it is
        # measured here and indexed by the reconstruct step, neither of
        # which works on the iterator that filter() returns on Python 3.
        cids_missing_refs = [cid for cid in cids_todo
                             if not nfs_exists(refs[cid])]
        if cids_missing_refs:
            self.reconstruct_ref_fa_for_clusters_in_bin(cids=cids_missing_refs,
                                                        refs=refs)

        return self.create_a_arrow_bin(cids_todo, d, uc, partial_uc, refs)
示例#20
0
    def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
        """
        Reconstruct the ref_fa of every cluster in cids from the final
        consensus FASTA, writing each one under
        self.cluster_dir_for_reconstructed_ref(cid)
        e.g.,
            self.g_consensus_ref_fa_of_cluster(cid)

        Liz: new cids after ice2 collection is b<bin>_c<cid>
        cids --- non-empty list of cids; compared directly against the
                 record ids of the final consensus FASTA
        refs --- dict{cid: ref_fa of cluster(cid)}; updated in place so
                 that refs[cid] points at the file that was written.

        Raise IOError if the final consensus FASTA does not exist.
        """
        # Check existence when first time it is read.
        if not nfs_exists(self.final_consensus_fa):
            raise IOError("Final consensus FASTA file {f} does not exist.".
                          format(f=self.final_consensus_fa))

        # The same line goes to stdout and to the log.
        start_msg = \
            "Reconstructing g consensus files for clusters {0}, {1} in {2}".\
            format(cids[0], cids[-1], self.tmp_dir)
        print(start_msg)
        self.add_log(start_msg)

        # Membership is tested once per consensus record, so use a set
        # rather than scanning the list each time.
        wanted_cids = set(cids)

        final_consensus_d = FastaRandomReader(self.final_consensus_fa)
        for ref_id in list(final_consensus_d.d.keys()):
            # Liz: with Ice2 cids the record id itself is the cid; parsing
            # an int out of "c<cid>/..." is no longer valid.
            cid = ref_id
            if cid not in wanted_cids:
                continue

            _dir = self.cluster_dir_for_reconstructed_ref(cid)
            mkdir(_dir)
            ref_fa = op.join(_dir, op.basename(refs[cid]))
            refs[cid] = ref_fa
            with FastaWriter(ref_fa) as writer:
                self.add_log("Writing ref_fa %s" % refs[cid])
                writer.writeRecord(ref_id,
                                   final_consensus_d[ref_id].sequence[:])

        self.add_log("Reconstruct of g consensus files completed.",
                     level=logging.INFO)
示例#21
0
    def check_quiver_jobs_completion(self):
        """Check whether quiver jobs are completed.

        self.submitted_quiver_jobs_log is read line by line; every line is
        expected to have exactly two tab-separated fields:
        <job_id><TAB>./quivered/<range>.sh
        where <job_id> is the literal string 'local' for jobs that were not
        submitted through SGE.

        Return one of the strings:
        "RUNNING" --- at least one job is still running and no finished job
                      is missing its output
        "FAILED"  --- at least one job is still running and at least one
                      finished job has a missing/empty output fastq
        "DONE"    --- no job was found to be still running

        Side effect: self.fq_filenames is reset and filled with the fastq
        files that exist and are not empty.
        """
        self.add_log("Checking if quiver jobs are completed.")
        done_flag = True  # turns False once any job is seen still running
        bad_sh = []  # scripts of finished jobs without a usable output
        self.fq_filenames = []
        submitted = {}  # job id (or script path for local jobs) --> script
        self.add_log("Submitted quiver jobs are at {f}:".
                     format(f=self.submitted_quiver_jobs_log))

        sge_used = False
        with open(self.submitted_quiver_jobs_log, 'r') as f:
            for line in f:
                a, b = line.strip().split('\t')
                if a == 'local':
                    # Local jobs have no job id; key them by script path.
                    submitted[b] = b
                else:
                    sge_used = True
                    submitted[a] = b

        running_jids = []
        if sge_used is True and self.use_sge is True:
            stuff = os.popen("qstat").read().strip().split('\n')
            # first two lines are header
            for x in stuff[2:]:
                # The first column of each qstat line is taken as the job id.
                job_id = x.split()[0]
                running_jids.append(job_id)
                if job_id in submitted:
                    self.add_log("job {0} is still running.".format(job_id))
                    done_flag = False

        for job_id, sh_name in submitted.iteritems():
            # Expected output: <quivered_dir>/<script name>, with '.sh'
            # replaced by '.quivered.fastq'.
            fq_filename = op.join(self.quivered_dir,
                                  op.basename(sh_name).replace('.sh', '.quivered.fastq'))

            if not nfs_exists(fq_filename) or \
                    os.stat(fq_filename).st_size == 0:
                if job_id in running_jids:  # still running, pass
                    done_flag = False
                else:
                    self.add_log("job {0} is completed but {1} is still empty!".
                                 format(job_id, fq_filename))
                    bad_sh.append(submitted[job_id])
            else:
                self.add_log("job {0} is done".format(job_id))
                self.fq_filenames.append(fq_filename)

        # NOTE(review): done_flag is only cleared for running jobs, so when
        # nothing is running but bad_sh is non-empty this still returns
        # "DONE" and the failures are only visible in the log --- confirm
        # with callers whether that is intended.
        if not done_flag:
            if len(bad_sh) == 0:
                return "RUNNING"
            else:
                self.add_log("The following jobs were completed but " +
                             "no output file. Please check and resubmit: " +
                             "\n{0}\n".format('\n'.join(bad_sh)))
                return "FAILED"
        else:
            return "DONE"
    def check_quiver_jobs_completion(self):
        """Check whether the submitted quiver jobs are completed.

        Every non-blank line of self.submitted_quiver_jobs_log should look like:
        <job_id> \t ./quivered/<range>.sh
        where <job_id> is the literal 'local' or a job id that is looked
        up in the output of `qstat`.

        As a side effect, self.fq_filenames is reset and then filled with
        every expected fastq file that exists and is non-empty.

        Returns:
        "DONE" --- no job is running and every expected fastq is non-empty
        "FAILED" --- at least one job is no longer running, yet its fastq
                     is missing or empty (the scripts are written to the log)
        "RUNNING" --- some jobs are still running and none has failed so far
        """
        self.add_log("Checking if quiver jobs are completed.")
        done_flag = True
        bad_sh = []
        self.fq_filenames = []
        submitted = {}
        self.add_log("Submitted quiver jobs are at {f}:".
                     format(f=self.submitted_quiver_jobs_log))

        sge_used = False
        with open(self.submitted_quiver_jobs_log, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # tolerate blank / trailing empty lines in the log
                    continue
                a, b = line.split('\t')
                if a == 'local':
                    # local jobs have no job id, key them by script name
                    submitted[b] = b
                else:
                    sge_used = True
                    submitted[a] = b

        # a set, because it is only used for membership tests
        running_jids = set()
        if sge_used is True and self.use_sge is True:
            stuff = os.popen("qstat").read().strip().split('\n')
            # first two lines are header
            for x in stuff[2:]:
                fields = x.split()
                if not fields:
                    continue
                job_id = fields[0]
                running_jids.add(job_id)
                if job_id in submitted:
                    self.add_log("job {0} is still running.".format(job_id))
                    done_flag = False

        for job_id, sh_name in submitted.items():
            fq_filename = op.join(self.quivered_dir,
                                  op.basename(sh_name).replace('.sh', '.quivered.fastq'))

            if not nfs_exists(fq_filename) or \
                    os.stat(fq_filename).st_size == 0:
                if job_id in running_jids:  # still running, pass
                    done_flag = False
                else:
                    self.add_log("job {0} is completed but {1} is still empty!".
                                 format(job_id, fq_filename))
                    bad_sh.append(sh_name)
            else:
                self.add_log("job {0} is done".format(job_id))
                self.fq_filenames.append(fq_filename)

        # Failures must be reported even when nothing is running any more;
        # previously that case fell through to "DONE" because done_flag is
        # only cleared by running jobs.
        if bad_sh:
            self.add_log("The following jobs were completed but " +
                         "no output file. Please check and resubmit: " +
                         "\n{0}\n".format('\n'.join(bad_sh)))
            return "FAILED"
        if not done_flag:
            return "RUNNING"
        return "DONE"
    def check_arrow_jobs_completion(self):
        """Check whether the submitted arrow jobs are completed.

        submitted_arrow_jobs.txt should have format like:
        <job_id> \t ./arrowed/c0to10.sh

        As a side effect, self.fq_filenames is reset and then filled with
        every expected fastq file that exists and is non-empty. When the
        result is "FAILED", the scripts of the failed jobs are also written,
        one per line, to self.unfinished_arrow_sh_files.

        Returns:
        "DONE" --- no job is running and every expected fastq is non-empty
        "FAILED" --- at least one job is no longer running, yet its fastq
                     is missing or empty
        "RUNNING" --- some jobs are still running and none has failed so far
        """
        self.add_log("Checking if arrow jobs are completed.")
        bad_sh = []
        self.fq_filenames = []
        self.add_log("Submitted arrow jobs are at {f}:".
                     format(f=self.arrow_submission_run_file))

        # submitted maps each expected fastq file to a pair
        # (SGE jobid or local, script file that produces it)
        sge_jobids, submitted = self.list_of_expected_arrow_fq_files()

        done_flag = True
        # a set, because it is only used for membership tests
        running_jids = set()
        # if one or more jobs were submitted through SGE,
        # go through qstat to see if anything is still running
        if len(sge_jobids):  # at least one job was run through SGE
            qstat_out = os.popen("qstat").read().strip()
            # qstat prints nothing at all when the queue is empty, which is
            # the normal situation once all jobs are finished; only check
            # the header when there is output to parse.
            if qstat_out:
                stuff = qstat_out.split('\n')
                assert stuff[0].startswith('job-ID')
                assert stuff[1].startswith('-------')
                # first two lines are header
                for x in stuff[2:]:
                    fields = x.split()
                    if not fields:
                        continue
                    job_id = fields[0]
                    running_jids.add(job_id)
                    if job_id in sge_jobids:
                        self.add_log("job {0} is still running.".format(job_id))
                        done_flag = False

        # now go through all the expected fastq files and check they exist
        for fq_filename, (job_id, sh_file) in submitted.items():
            if not nfs_exists(fq_filename) or \
                    os.stat(fq_filename).st_size == 0:
                if job_id in running_jids:  # still running, pass
                    done_flag = False
                else:
                    self.add_log("job {0} is completed but {1} is still empty!".
                                 format(job_id, fq_filename))
                    bad_sh.append(sh_file)
            else:
                self.add_log("job {0} is done".format(job_id))
                self.fq_filenames.append(fq_filename)

        # Failures must be reported even when nothing is running any more;
        # previously that case fell through to "DONE" because done_flag is
        # only cleared by running jobs.
        if bad_sh:
            # write the unfinished jobs to $unfinished_arrow_sh_files$
            with open(self.unfinished_arrow_sh_files, 'w') as f:
                f.write("\n".join(bad_sh) + '\n')
            self.add_log("Some jobs were incomplete! Please re-run all files listed in {1}.\n".format(
                len(bad_sh), f.name))
            return "FAILED"
        if not done_flag:
            return "RUNNING"
        return "DONE"
示例#24
0
    def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc, tmp_dir,
                         ref_fasta=None):
        """
        Check inputs, write $ICE_PARTIAL_PY i command to script_file
        and return (input_fasta, ref_fasta, out_pickle, done_file)
        for the i-th chunk of nfl reads.

        Note that the ref_fasta argument is not used: it is always replaced
        by the final consensus fasta that IceFiles reports for root_dir.

        Raises IOError if the i-th splitted fasta, the consensus fasta or
        the dazz db does not exist. The log opened by IceFiles is closed
        on every exit path, including when an error is raised.
        """
        icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                        root_dir=root_dir, no_log_f=False)

        try:
            # root_dir/output/final.consensus.fasta
            ref_fasta = icef.final_consensus_fa
            ref_dazz = icef.final_dazz_db

            # root_dir/output/map_noFL/input.split_{0:03d}.fasta
            input_fasta = icef.nfl_fa_i(i)

            # $input_fasta.partial_uc.pickle
            out_pickle = icef.nfl_pickle_i(i)

            # $input_fasta.partial_uc.pickle.DONE
            done_file = icef.nfl_done_i(i)

            # $input_fasta.partial_uc.sh
            script_file = icef.nfl_script_i(i)

            # Check if inputs exist.
            errMsg = ""
            if not nfs_exists(input_fasta):
                errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                          "fasta file {f} does not exist. ".format(f=input_fasta) +
                          "Please run $ICE_PARTIAL_PY split first.")
            elif not nfs_exists(ref_fasta):
                # ref_fasta --- root_dir/output/final.consensus.fasta
                # ref_dazz --- root_dir/output/final.consensus.dazz.fasta.db
                # ref_fasta and ref_dazz must exist if ICE has run successfully
                # in root_dir. If either one does not exist, it means ICE has
                # not successfully run in root_dir. Then we have to throw an
                # error message requiring users to copy the root_dir/output
                # directory manually, rather than providing an option to
                # overwrite ref_fasta and build ref_dazz, because a race
                # condition can happen when multiple IcePartialI tasks start
                # to run at the same time, which can corrupt fasta and dazz db
                # files and lead to unexpected runtime errors.
                errMsg = ("The unpolished consensus isoforms fasta file " +
                          "{f} does not exist. ".format(f=ref_fasta) +
                          "Please make sure ICE is successfully done in root_dir, " +
                          "or copy ICE output directory (e.g., cluster_out/output) " +
                          "to {dst}".format(dst=op.dirname(ref_fasta)))
            elif not nfs_exists(ref_dazz):
                errMsg = ("The dazz db " +
                          "{f} does not exist. ".format(f=ref_dazz) +
                          "Please make sure it is already built.")
            if errMsg:
                raise IOError(errMsg)

            # Save cmd to script_file.
            cmd = self._cmd_str(root_dir=root_dir, i=[i],
                                ccs_fofn=ccs_fofn,
                                blasr_nproc=blasr_nproc,
                                tmp_dir=tmp_dir)
            with open(script_file, 'w') as writer:
                writer.write(cmd + "\n")

            icef.add_log("Writing CMD to: {script_file}".
                         format(script_file=script_file))
        finally:
            # release the log even if validation or script writing failed
            icef.close_log()

        return (input_fasta, ref_fasta, out_pickle, done_file)
示例#25
0
    def check_arrow_jobs_completion(self):
        """Check whether the submitted arrow jobs are completed.

        submitted_arrow_jobs.txt should have format like:
        <job_id> \t ./arrowed/c0to10.sh

        As a side effect, self.fq_filenames is reset and then filled with
        every expected fastq file that exists and is non-empty. When the
        result is "FAILED", the scripts of the failed jobs are also written,
        one per line, to self.unfinished_arrow_sh_files.

        Returns:
        "DONE" --- no job is running and every expected fastq is non-empty
        "FAILED" --- at least one job is no longer running, yet its fastq
                     is missing or empty
        "RUNNING" --- some jobs are still running and none has failed so far
        """
        self.add_log("Checking if arrow jobs are completed.")
        bad_sh = []
        self.fq_filenames = []
        self.add_log("Submitted arrow jobs are at {f}:".format(
            f=self.arrow_submission_run_file))

        # submitted maps each expected fastq file to a pair
        # (SGE jobid or local, script file that produces it)
        sge_jobids, submitted = self.list_of_expected_arrow_fq_files()

        done_flag = True
        # a set, because it is only used for membership tests
        running_jids = set()
        # if one or more jobs were submitted through SGE,
        # go through qstat to see if anything is still running
        if len(sge_jobids):  # at least one job was run through SGE
            qstat_out = os.popen("qstat").read().strip()
            # qstat prints nothing at all when the queue is empty, which is
            # the normal situation once all jobs are finished; only check
            # the header when there is output to parse.
            if qstat_out:
                stuff = qstat_out.split('\n')
                assert stuff[0].startswith('job-ID')
                assert stuff[1].startswith('-------')
                # first two lines are header
                for x in stuff[2:]:
                    fields = x.split()
                    if not fields:
                        continue
                    job_id = fields[0]
                    running_jids.add(job_id)
                    if job_id in sge_jobids:
                        self.add_log("job {0} is still running.".format(job_id))
                        done_flag = False

        # now go through all the expected fastq files and check they exist
        for fq_filename, (job_id, sh_file) in submitted.items():
            if not nfs_exists(fq_filename) or \
                    os.stat(fq_filename).st_size == 0:
                if job_id in running_jids:  # still running, pass
                    done_flag = False
                else:
                    self.add_log(
                        "job {0} is completed but {1} is still empty!".format(
                            job_id, fq_filename))
                    bad_sh.append(sh_file)
            else:
                self.add_log("job {0} is done".format(job_id))
                self.fq_filenames.append(fq_filename)

        # Failures must be reported even when nothing is running any more;
        # previously that case fell through to "DONE" because done_flag is
        # only cleared by running jobs.
        if bad_sh:
            # write the unfinished jobs to $unfinished_arrow_sh_files$
            with open(self.unfinished_arrow_sh_files, 'w') as f:
                f.write("\n".join(bad_sh) + '\n')
            self.add_log("Some jobs were incomplete! Please re-run all files listed in {1}.\n".format(
                len(bad_sh), f.name))
            return "FAILED"
        if not done_flag:
            return "RUNNING"
        return "DONE"