Example #1
    def _validate_inputs(self, root_dir, nfl_fa, N):
        """
        Check inputs, return
        (num_reads,
         number_reads_per_chunk,
         nfl_dir,
         [i-th_chunk_nfl_fa for i in [0...N-1]])
        """
        icef = IceFiles(prog_name="ice_partial_split",
                        root_dir=root_dir,
                        no_log_f=False)

        nfl_dir = icef.nfl_dir

        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

        mkdir(icef.nfl_dir)

        # Check if inputs exist.
        errMsg = ""
        if not nfs_exists(nfl_fa):
            errMsg = ("The input non-full-length reads fasta file " +
                      "{f} does not exists. ".format(f=nfl_fa))
        if len(errMsg) != 0:
            raise ValueError(errMsg)

        num_reads = num_reads_in_fasta(nfl_fa)
        reads_per_split = int(max(1, ceil(num_reads * 1.0 / N)))

        return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
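All of these examples lean on an idempotent mkdir helper that quietly succeeds when the target directory already exists. A minimal sketch of such a helper, assuming it lives in the project's utility module (the exact import path is not shown in these snippets):

import errno
import os

def mkdir(path):
    """Create path (and any missing parents); do nothing if it already exists."""
    try:
        os.makedirs(path)
    except OSError as e:
        # Another process may have created the directory first (see the race note in Example #20).
        if e.errno != errno.EEXIST:
            raise
    return path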
Example #2
    def test_runner(self):
        """Test CombineRunner."""
        ipq_opts = IceQuiverHQLQOptions(qv_trim_5=100, qv_trim_3=30)
        d = op.join(SIV_DATA_DIR, "test_tool_contract_chunks")
        split_dirs = [op.join(d, b, "cluster_out") for b in
                      ("0to1kb_part0", "1to2kb_part0", "2to3kb_part0", "3to4kb_part0", "4to5kb_part0")]
        print(split_dirs)
        out_combined_dir = op.join(OUT_DIR, "test_CombineUtils", "combined_dir")
        rmpath(out_combined_dir)
        mkdir(out_combined_dir)
        obj = CombineRunner(combined_dir=out_combined_dir,
                            sample_name="mysample",
                            split_dirs=split_dirs,
                            ipq_opts=ipq_opts)
        obj.run()

        expected_out_fns = (obj.all_hq_fa, obj.all_hq_fq, obj.all_lq_fa, obj.all_lq_fq,
                            obj.all_consensus_isoforms_fa,
                            obj.all_cluster_report_fn, obj.all_cluster_summary_fn)
        self.assertTrue(all([op.exists(f) for f in expected_out_fns]))

        expected_hq_isoforms = ['i1_HQ_mysample|c0/f2p16/1826', 'i2_HQ_mysample|c2/f9p14/2470',
                                'i2_HQ_mysample|c5/f7p19/2472', 'i2_HQ_mysample|c10/f8p16/2457',
                                'i2_HQ_mysample|c98/f2p10/2081', 'i2_HQ_mysample|c108/f23p28/2471']
        self.assertEqual([r.name.split(' ')[0] for r in FastaReader(obj.all_hq_fa)], expected_hq_isoforms)
        self.assertEqual([r.name.split(' ')[0] for r in FastqReader(obj.all_hq_fq)], expected_hq_isoforms)

        expected_lq_isoforms_num = 73
        self.assertEqual(len([r for r in FastaReader(obj.all_lq_fa)]), expected_lq_isoforms_num)

        expected_consensus_isoforms_num = 79
        self.assertEqual(len([r for r in FastaReader(obj.all_consensus_isoforms_fa)]), expected_consensus_isoforms_num)
Example #3
 def __init__(self, input_fasta, reads_per_split, out_dir, out_prefix):
     self.input_fasta = input_fasta
     self.out_dir = out_dir
     self.reads_per_split = reads_per_split  # Number of reads per split
     self.out_prefix = out_prefix
     self.out_fns = None
     mkdir(self.out_dir)
def make_sane(args):
    """Make sane of input output"""
    args.smrtlink_job_dir = realpath(args.smrtlink_job_dir)
    args.out_dir = realpath(args.out_dir)

    if args.gmap_db is None:
        args.gmap_db = realpath(GMAP_DB)
        log.warning("Reset GMAP DB to %s", args.gmap_db)

    if args.gmap_name is None:
        args.gmap_name = GMAP_NAME
        log.warning("Reset GMAP NAME to %s", args.gmap_name)

    if not op.exists(args.smrtlink_job_dir):
        raise IOError("SMRTLink job directory %s does not exist" % args.smrtlink_job_dir)

    if not op.exists(op.join(args.gmap_db, args.gmap_name)):
        raise IOError("GMAP reference %s/%s does not exist." % (args.gmap_db, args.gmap_name))

    if not op.exists(args.gencode_gtf):
        raise IOError("Gencode gtf file %s does not exist." % args.gencode_gtf)

    log.info("Making out_dir %s", args.out_dir)
    mkdir(args.out_dir)
    return args
Example #5
def convert_fofn_to_fasta(fofn_filename,
                          out_filename,
                          fasta_out_dir,
                          force_overwrite=False):
    """
    For each .bax.h5 file, create a .bax.h5.fasta file and save the paths to
    out_filename, which should usually be 'input.fasta.fofn'.
    Modified: 09/14/2015, both ends of subreads in fasta files will
    be trimmed in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_file_or_fofn(fofn_filename)
    out_fns = []
    mkdir(fasta_out_dir)
    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug(
                "File {0} already exists. skipping.".format(out_file))
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            execute(cmd=cmd)
        out_fns.append(out_file)
    write_files_to_fofn(out_fns, out_filename)
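A small illustration of the filename mapping performed above, using a hypothetical input path; only op.basename and op.join from the example are exercised:

import os.path as op

in_fn = "/data/run1/m111xxxx.1.bax.h5"    # hypothetical movie file listed in the fofn
fasta_out_dir = "fasta_out"               # hypothetical output directory
out_file = op.join(fasta_out_dir, op.basename(in_fn) + '.fasta')
assert out_file == "fasta_out/m111xxxx.1.bax.h5.fasta"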
Example #6
 def make_cluster_out_dir(in_dir, root_dir):
     bin_name = op.basename(op.dirname(in_dir))
     new_dir = op.join(
         root_dir, bin_name,
         "cluster_out")  #e.g., root_dir/0to1kb_part0/cluster_out
     mkdir(new_dir)
     return new_dir
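A quick check of the path derivation above with hypothetical arguments; in_dir is expected to sit one level below the bin directory (e.g., a cluster_out folder):

import os.path as op

in_dir = "/old_root/0to1kb_part0/cluster_out"   # hypothetical existing bin sub-directory
root_dir = "/new_root"                          # hypothetical new root
bin_name = op.basename(op.dirname(in_dir))
assert op.join(root_dir, bin_name, "cluster_out") == "/new_root/0to1kb_part0/cluster_out"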
Example #7
    def __init__(self,
                 root_dir,
                 fasta_filenames,
                 fastq_filenames,
                 ref_fasta,
                 out_pickle,
                 ice_opts,
                 sge_opts,
                 cpus,
                 tmp_dir=None):
        """
        fasta_filenames --- a list of split nfl fasta files.

        ref_fasta --- (unpolished) consensus isoforms

        out_pickle --- a pickle file with all nfl fasta reads

        root_dir --- ICE root output directory

        tmp_dir --- if not None, write temporary clusters, dazz, las
                    files to the given temporary directory

        sge_opts --- params for SGE environment, including
            use_sge    : use SGE or not
            max_sge_jobs: maximum number of sub-jobs submitted
            unique_id  : unique qsub job id, important that this
                        DOES NOT CONFLICT!

        """
        self.prog_name = "IceAllPartials"
        IceFiles.__init__(self,
                          prog_name=self.prog_name,
                          root_dir=root_dir,
                          tmp_dir=tmp_dir)

        self.fasta_filenames, self.ref_fasta = \
            self._validate_inputs(fasta_filenames=fasta_filenames,
                                  ref_fasta=ref_fasta)

        if fastq_filenames is not None:
            for fq in fastq_filenames:
                assert op.exists(fq)

        self.fastq_filenames = fastq_filenames  # note: could be None

        self.out_pickle = out_pickle

        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        self.cpus = cpus  # this is the number of CPUs to use per SGE job or per local job

        self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
        mkdir(self.nfl_dir)

        self.add_log("input fasta files are: " +
                     ", ".join(self.fasta_filenames))
        self.add_log("temp pickle files are: " +
                     ", ".join(self.pickle_filenames))
        self.add_log("out pickle file is: " + self.out_pickle)
        self.add_log("temp directory is: " + str(self.tmp_dir))
Example #8
    def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
        """
        Reconstruct ref_fa of the cluster in the new tmp_dir
        e.g.,
            self.g_consensus_ref_fa_of_cluster(cid)

        cids --- list[int(cid)], e.g., [10, 11, 12, ..., 20]
        refs --- dict{int(cid): ref_fa of cluster(cid)}
        """
        # Check existence when first time it is read.
        if not nfs_exists(self.final_consensus_fa):
            raise IOError("Final consensus FASTA file {f}".format(
                f=self.final_consensus_fa) + "does not exist.")

        self.add_log("Reconstructing g consensus files for clusters "
                     "[%d, %d] in %s" % (cids[0], cids[-1], self.tmp_dir),
                     level=logging.INFO)

        final_consensus_d = FastaRandomReader(self.final_consensus_fa)
        for ref_id in final_consensus_d.d.keys():
            cid = int(ref_id.split('/')[0].replace('c', ''))
            # e.g., ref_id = c103/1/3708, cid = 103,
            #       refs[cid] = ...tmp/0/c103/g_consensus_ref.fasta
            if cid in cids:
                mkdir(self.cluster_dir(cid))
                ref_fa = op.join(self.cluster_dir(cid), op.basename(refs[cid]))
                refs[cid] = ref_fa
                with FastaWriter(ref_fa) as writer:
                    self.add_log("Writing ref_fa %s" % refs[cid])
                    writer.writeRecord(ref_id,
                                       final_consensus_d[ref_id].sequence[:])

        self.add_log("Reconstruct of g consensus files completed.",
                     level=logging.INFO)
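The cluster id is recovered from the reference id exactly as the docstring comment above shows; a one-line check:

ref_id = "c103/1/3708"                          # example id taken from the comment above
cid = int(ref_id.split('/')[0].replace('c', ''))
assert cid == 103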
Example #9
def sanity_check_daligner(scriptDir, testDirName="daligner_test_dir"):
    """
    Run daligner on gcon_in.fa, but don't care about results.
    Just make sure it runs.
    """
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    mkdir(scriptDir)
    mkdir(testDir)

    testInFa = op.join(testDir, "daligner.fasta")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert op.exists(testInFa)

    runner = DalignerRunner(query_filename=testInFa,
                            target_filename=testInFa,
                            is_FL=True, same_strand_only=True,
                            query_converted=False, target_converted=False,
                            use_sge=False, cpus=4, sge_opts=None)
    runner.run(output_dir=testDir, min_match_len=300, sensitive_mode=False)
    runner.clean_run()

    shutil.rmtree(testDir)
    logging.info("daligner check passed.")
    return True
Example #10
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False):
    """
    For each .bax.h5 file, create a .bax.h5.fasta file and save the paths to
    out_filename, which should usually be 'input.fasta.fofn'.
    Modified: 09/14/2015, both ends of subreads in fasta files will
    be trimmed in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_file_or_fofn(fofn_filename)
    out_fns = []
    mkdir(fasta_out_dir)
    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists. skipping.".format(out_file))
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            execute(cmd=cmd)
        out_fns.append(out_file)
    write_files_to_fofn(out_fns, out_filename)
Example #11
 def __init__(self, input_fasta, reads_per_split, out_dir, out_prefix):
     self.input_fasta = input_fasta
     self.out_dir = out_dir
     self.reads_per_split = reads_per_split  # Number of reads per split
     self.out_prefix = out_prefix
     self.out_fns = None
     mkdir(self.out_dir)
def make_sane(args):
    """Make sane of input output"""
    args.smrtlink_job_dir = realpath(args.smrtlink_job_dir)
    args.out_dir = realpath(args.out_dir)

    if args.gmap_db is None:
        args.gmap_db = realpath(GMAP_DB)
        log.warning("Reset GMAP DB to %s", args.gmap_db)

    if args.gmap_name is None:
        args.gmap_name = GMAP_NAME
        log.warning("Reset GMAP NAME to %s", args.gmap_name)

    if not op.exists(args.smrtlink_job_dir):
        raise IOError("SMRTLink job directory %s does not exist" %
                      args.smrtlink_job_dir)

    if not op.exists(op.join(args.gmap_db, args.gmap_name)):
        raise IOError("GMAP reference %s/%s does not exist." %
                      (args.gmap_db, args.gmap_name))

    if not op.exists(args.gencode_gtf):
        raise IOError("Gencode gtf file %s does not exist." % args.gencode_gtf)

    log.info("Making out_dir %s", args.out_dir)
    mkdir(args.out_dir)
    return args
Example #13
    def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
        """
        Reconstruct ref_fa of the cluster in the new tmp_dir
        e.g.,
            self.g_consensus_ref_fa_of_cluster(cid)

        cids --- list[int(cid)], e.g., [10, 11, 12, ..., 20]
        refs --- dict{int(cid): ref_fa of cluster(cid)}
        """
        # Check existence when first time it is read.
        if not nfs_exists(self.final_consensus_fa):
            raise IOError("Final consensus FASTA file {f}".format(
                f=self.final_consensus_fa) + "does not exist.")

        self.add_log("Reconstructing g consensus files for clusters "
                     "[%d, %d] in %s" % (cids[0], cids[-1], self.tmp_dir),
                     level=logging.INFO)

        final_consensus_d = FastaRandomReader(self.final_consensus_fa)
        for ref_id in final_consensus_d.d.keys():
            cid = int(ref_id.split('/')[0].replace('c', ''))
            # e.g., ref_id = c103/1/3708, cid = 103,
            #       refs[cid] = ...tmp/0/c103/g_consensus_ref.fasta
            if cid in cids:
                mkdir(self.cluster_dir(cid))
                ref_fa = op.join(self.cluster_dir(cid),
                                 op.basename(refs[cid]))
                refs[cid] = ref_fa
                with FastaWriter(ref_fa) as writer:
                    self.add_log("Writing ref_fa %s" % refs[cid])
                    writer.writeRecord(ref_id,
                                       final_consensus_d[ref_id].sequence[:])

        self.add_log("Reconstruct of g consensus files completed.",
                     level=logging.INFO)
Example #14
    def _validate_inputs(self, root_dir, nfl_fa, N):
        """
        Check inputs, return
        (num_reads,
         number_reads_per_chunk,
         nfl_dir,
         [i-th_chunk_nfl_fa for i in [0...N-1]])
        """
        icef = IceFiles(prog_name="ice_partial_split",
                        root_dir=root_dir, no_log_f=False)

        nfl_dir = icef.nfl_dir

        # root_dir/output/map_noFL/input.split_{0:03d}.fasta
        splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

        mkdir(icef.nfl_dir)

        # Check if inputs exist.
        errMsg = ""
        if N <= 0 or N > 100:
            errMsg = "Input file cannot be split into %d chunks!" % N

        if not nfs_exists(nfl_fa):
            errMsg = ("The input non-full-length reads fasta file " +
                      "{f} does not exist.".format(f=nfl_fa))
        if len(errMsg) != 0:
            raise ValueError(errMsg)

        num_reads = num_reads_in_fasta(nfl_fa)
        reads_per_split = int(max(1, ceil(num_reads * 1.0 / N)))

        return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
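The chunk size is a ceiling division of the read count by the number of chunks, floored at one read per chunk; for example:

from math import ceil

num_reads, N = 10, 3
reads_per_split = int(max(1, ceil(num_reads * 1.0 / N)))
assert reads_per_split == 4                     # 10 reads split as 4 + 4 + 2
assert int(max(1, ceil(0 * 1.0 / 3))) == 1      # never fewer than 1 read per chunk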
Example #15
def sanity_check_daligner(scriptDir, testDirName="daligner_test_dir"):
    """
    Run daligner on gcon_in.fa, but don't care about results.
    Just make sure it runs.
    """
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    mkdir(scriptDir)
    mkdir(testDir)

    testInFa = op.join(testDir, "daligner.fasta")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert op.exists(testInFa)

    runner = DalignerRunner(query_filename=testInFa,
                            target_filename=testInFa,
                            is_FL=True,
                            same_strand_only=True,
                            query_converted=False,
                            target_converted=False,
                            use_sge=False,
                            cpus=4,
                            sge_opts=None)
    runner.run(output_dir=testDir, min_match_len=300, sensitive_mode=False)
    runner.clean_run()

    shutil.rmtree(testDir)
    logging.info("daligner check passed.")
    return True
Example #16
 def __init__(self, input_fa_or_fq, reads_per_split, out_dir, out_format,
              is_fq):
     self.input_fa_or_fq = input_fa_or_fq
     self.is_fq = is_fq
     self.out_dir = out_dir
     self.reads_per_split = reads_per_split  # Number of reads per split
     self.out_format = out_format
     self.out_fns = None
     mkdir(self.out_dir)
Example #17
    def __enter__(self):
        # make a sub dir for each separation criteria
        for d in self.out_dirs:
            mkdir(d)

        # open all fasta file handlers
        for index, key in enumerate(self.sorted_keys):
            self.handles[key] = open(self.out_fasta_files[index], 'w')

        return self
Example #19
    def __init__(self,
                 root_dir,
                 fasta_filenames,
                 ref_fasta,
                 out_pickle,
                 sge_opts,
                 ccs_fofn=None,
                 tmp_dir=None):
        """
        fasta_filenames --- a list of split nfl fasta files.

        ref_fasta --- (unpolished) consensus isoforms

        out_pickle --- a pickle file with all nfl fasta reads

        ccs_fofn --- should be reads_of_insert.fofn or None

        root_dir --- ICE root output directory

        tmp_dir --- if not None, write temporary clusters, dazz, las
                    files to the given temporary directory

        sge_opts --- params for SGE environment, including
            use_sge    : use SGE or not
            max_sge_jobs: maximum number of sub-jobs submitted
            unique_id  : unique qsub job id, important that this
                        DOES NOT CONFLICT!
            blasr_nproc: blasr -nproc param, number of threads per cpu.

        """
        self.prog_name = "IceAllPartials"
        IceFiles.__init__(self,
                          prog_name=self.prog_name,
                          root_dir=root_dir,
                          tmp_dir=tmp_dir)

        self.fasta_filenames, self.ref_fasta, self.ccs_fofn, = \
            self._validate_inputs(fasta_filenames=fasta_filenames,
                                  ref_fasta=ref_fasta,
                                  ccs_fofn=ccs_fofn)

        self.out_pickle = out_pickle

        self.sge_opts = sge_opts

        self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
        mkdir(self.nfl_dir)

        self.add_log("input fasta files are: " +
                     ", ".join(self.fasta_filenames))
        self.add_log("temp pickle files are: " +
                     ", ".join(self.pickle_filenames))
        self.add_log("out pickle file is: " + self.out_pickle)
        self.add_log("temp directory is: " + str(self.tmp_dir))
Example #20
    def validate_inputs(self):
        """Validate input fofns, and root_dir, log_dir, tmp_dir,
        create quivered_dir and quivered_log_dir"""
        self.add_log("Validating inputs.")

        # Create directories: root_dir/quivered and root_dir/log_dir/quivered
        try:
            mkdir(self.quivered_dir)
            mkdir(self.quivered_log_dir)
        except OSError:
            # Multiple ice_quiver_i jobs may run at the same time and try to
            # mkdir, race condition may happen, so ignore OSError here.
            pass

        errMsg = ""

        if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
            errMsg = "Log dir {l} is not an existing directory.".\
                format(l=self.log_dir)
        elif self.bas_fofn is None:
            errMsg = "Please specify subreads file (e.g., --bas_fofn=input.fofn|subreadset.xml)."
        elif not nfs_exists(self.bas_fofn):
            errMsg = "Specified subreads file (bas_fofn={f}) does not exist.".format(
                f=self.bas_fofn)
        elif not nfs_exists(self.nfl_all_pickle_fn):
            #"output/map_noFL/noFL.ALL.partial_uc.pickle"):
            errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                     "which assigns all non-full-length reads to isoforms " + \
                     "does not exist. Please check 'ice_partial.py *' are " + \
                     "all done."
        elif not nfs_exists(self.final_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                     "which assigns full-length non-chimeric reads to " + \
                     "isoforms does not exist."

        if self.bas_fofn is not None and \
            guess_file_format(self.bas_fofn) is not FILE_FORMATS.BAM:
            # No need to convert subreads.bam to fasta
            if self.fasta_fofn is None:
                errMsg = "Please make sure ice_make_fasta_fofn has " + \
                         "been called, and specify fasta_fofn."
            elif not nfs_exists(self.fasta_fofn):
                errMsg = "Input fasta_fofn {f} does not exist.".\
                         format(f=self.fasta_fofn)
            else:
                fasta_files = get_files_from_file_or_fofn(self.fasta_fofn)
                for fasta_file in fasta_files:
                    if not nfs_exists(fasta_file):
                        errMsg = "A file {f} in fasta_fofn does not exist.".\
                                 format(f=fasta_file)

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
Example #21
    def validate_inputs(self):
        """Validate input fofns, and root_dir, log_dir, tmp_dir,
        create quivered_dir and quivered_log_dir"""
        self.add_log("Validating inputs.")

        # Create directories: root_dir/quivered and root_dir/log_dir/quivered
        try:
            mkdir(self.quivered_dir)
            mkdir(self.quivered_log_dir)
        except OSError:
            # Multiple ice_quiver_i jobs may run at the same time and try to
            # mkdir, race condition may happen, so ignore OSError here.
            pass

        errMsg = ""

        if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
            errMsg = "Log dir {l} is not an existing directory.".\
                format(l=self.log_dir)
        elif self.bas_fofn is None:
            errMsg = "Please specify subreads file (e.g., --bas_fofn=input.fofn|subreadset.xml)."
        elif not nfs_exists(self.bas_fofn):
            errMsg = "Specified subreads file (bas_fofn={f}) does not exist.".format(f=self.bas_fofn)
        elif not nfs_exists(self.nfl_all_pickle_fn):
            #"output/map_noFL/noFL.ALL.partial_uc.pickle"):
            errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                     "which assigns all non-full-length reads to isoforms " + \
                     "does not exist. Please check 'ice_partial.py *' are " + \
                     "all done."
        elif not nfs_exists(self.final_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                     "which assigns full-length non-chimeric reads to " + \
                     "isoforms does not exist."

        if self.bas_fofn is not None and \
            guess_file_format(self.bas_fofn) is not FILE_FORMATS.BAM:
            # No need to convert subreads.bam to fasta
            if self.fasta_fofn is None:
                errMsg = "Please make sure ice_make_fasta_fofn has " + \
                         "been called, and specify fasta_fofn."
            elif not nfs_exists(self.fasta_fofn):
                errMsg = "Input fasta_fofn {f} does not exist.".\
                         format(f=self.fasta_fofn)
            else:
                fasta_files = get_files_from_file_or_fofn(self.fasta_fofn)
                for fasta_file in fasta_files:
                    if not nfs_exists(fasta_file):
                        errMsg = "A file {f} in fasta_fofn does not exist.".\
                                 format(f=fasta_file)

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
Example #22
    def __init__(self,
                 prog_name,
                 root_dir,
                 bas_fofn=None,
                 ccs_fofn=None,
                 fasta_fofn=None,
                 no_log_f=False,
                 tmp_dir=None,
                 make_dirs=True):
        """
        prog_name --- name of a sub-class
        root_dir --- root directory of the whole project. There will be
                     sub-directories under it, including:
                     tmp/ --- 0/  c0, c1, ..., c9999
                          --- 1/  c10000, c10001, ..., c19999
                          ...
                          each c? folder contains data for a cluster id=c?
                     script/
                          --- 0/  gcon_job_?.sh, gcon jobs in the first iteration
                          --- 1/  gcon_job_?.sh, gcon jobs in the second iteration
                          ...
                     log/
                          --- ICE.log   Log of the ICE algorithm
                          --- 0/  log for jobs in the first iteration
                          ...
                     output/   output files go here.
        bas_fofn --- input.fofn which contains movie.bas|bax.h5 files.
        ccs_fofn --- a fofn which contains movie.ccs.h5 files.
        fasta_fofn --- a fofn which contains movie.bax.h5.fasta files.
                     script/
        no_log_f --- DON'T write log to a log file.
        tmp_dir --- Write temporary files to tmp_dir (usually /scratch) for speed
        """
        self.prog_name = str(prog_name)
        self.root_dir = real_ppath(root_dir)
        self._tmp_dir = real_ppath(tmp_dir)

        self.bas_fofn = real_ppath(bas_fofn)
        self.ccs_fofn = real_ppath(ccs_fofn)
        self.fasta_fofn = real_ppath(fasta_fofn)

        if make_dirs is True:
            mkdir(self.root_dir)
            mkdir(self.tmp_dir)
            mkdir(self.log_dir)
            mkdir(self.script_dir)
            mkdir(self.out_dir)

        self.no_log_f = no_log_f
        if not no_log_f:
            self.log_f = open(self.log_fn, 'w', 0)
            self.add_log(msg="{p} initialized.".format(p=self.prog_name))
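With make_dirs=True the constructor lays out the project skeleton described in its docstring; a hypothetical call, with the attribute-to-path mapping assumed from the mkdir calls and docstring above:

# icef = IceFiles(prog_name="my_ice_step", root_dir="ice_project")
# Expected layout afterwards (per the docstring):
#   ice_project/          root_dir
#   ice_project/tmp/      tmp_dir (unless an external tmp_dir was passed in)
#   ice_project/log/      log_dir, which also holds ICE.log
#   ice_project/script/   script_dir
#   ice_project/output/   out_dir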
Example #23
 def __init__(self, root_dir, subread_set, nproc):
     tmp_dir = op.join(root_dir, "tmp")
     mkdir(tmp_dir)
     super(IceQuiverRTC,
           self).__init__(root_dir=root_dir,
                          bas_fofn=subread_set,
                          fasta_fofn=None,
                          sge_opts=SgeOptions(unique_id=12345,
                                              use_sge=False,
                                              max_sge_jobs=0,
                                              blasr_nproc=nproc,
                                              quiver_nproc=nproc),
                          prog_name="IceQuiver")
Example #24
    def __init__(self, root_dir, fasta_filenames, fastq_filenames, ref_fasta,
                 out_pickle, ice_opts, sge_opts, cpus, tmp_dir=None):
        """
        fasta_filenames --- a list of split nfl fasta files.

        ref_fasta --- (unpolished) consensus isoforms

        out_pickle --- a pickle file with all nfl fasta reads

        root_dir --- ICE root output directory

        tmp_dir --- if not None, write temporary clusters, dazz, las
                    files to the given temporary directory

        sge_opts --- params for SGE environment, including
            use_sge    : use SGE or not
            max_sge_jobs: maximum number of sub-jobs submitted
            unique_id  : unique qsub job id, important that this
                        DOES NOT CONFLICT!

        """
        self.prog_name = "IceAllPartials"
        IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir, tmp_dir=tmp_dir)

        self.fasta_filenames, self.ref_fasta = \
            self._validate_inputs(fasta_filenames=fasta_filenames,
                                  ref_fasta=ref_fasta)

        if fastq_filenames is not None:
            for fq in fastq_filenames:
                assert op.exists(fq)

        self.fastq_filenames = fastq_filenames # note: could be None


        self.out_pickle = out_pickle

        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        self.cpus = cpus # this is the number of CPUs to use per SGE job or per local job


        self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
        mkdir(self.nfl_dir)

        self.add_log("input fasta files are: " +
                     ", ".join(self.fasta_filenames))
        self.add_log("temp pickle files are: " +
                     ", ".join(self.pickle_filenames))
        self.add_log("out pickle file is: " + self.out_pickle)
        self.add_log("temp directory is: " + str(self.tmp_dir))
 def _cp(task, new_task, copied_files, copy_consensus_isoforms=copy_consensus_isoforms,
         copy_flnc_pickle=copy_flnc_pickle, copy_nfl_pickle=copy_nfl_pickle):
     """Copy task.files to new_task.files."""
     if copy_consensus_isoforms is True and new_task.consensus_isoforms_file not in copied_files:
         shutil.copy(task.consensus_isoforms_file, new_task.consensus_isoforms_file)
         copied_files[new_task.consensus_isoforms_file] = True
     if copy_flnc_pickle is True and new_task.flnc_pickle not in copied_files:
         mkdir(op.dirname(new_task.flnc_pickle))
         shutil.copy(task.flnc_pickle, new_task.flnc_pickle)
         copied_files[new_task.flnc_pickle] = True
     if copy_nfl_pickle is True and new_task.nfl_pickle not in copied_files:
         mkdir(op.dirname(new_task.nfl_pickle))
         shutil.copy(task.nfl_pickle, new_task.nfl_pickle)
         copied_files[new_task.nfl_pickle] = True
Example #26
 def __init__(self, root_dir, subread_set, nproc):
     tmp_dir = op.join(root_dir, "tmp")
     mkdir(tmp_dir)
     super(IceQuiverRTC, self).__init__(
         root_dir=root_dir,
         bas_fofn=subread_set,
         fasta_fofn=None,
         sge_opts=SgeOptions(
             unique_id=12345,
             use_sge=False,
             max_sge_jobs=0,
             blasr_nproc=nproc,
             quiver_nproc=nproc),
         prog_name="IceQuiver")
Example #27
def resolved_tool_contract_to_args(resolved_tool_contract):
    """Convert resolved tool contract to args."""
    rtc = resolved_tool_contract
    args = [
        "--verbose",
        "classify",
        resolved_tool_contract.task.input_files[0],
        resolved_tool_contract.task.output_files[0],
        "--flnc",
        resolved_tool_contract.task.output_files[1],
        "--nfl",
        resolved_tool_contract.task.output_files[2],
        "--summary",
        resolved_tool_contract.task.output_files[3],  # JSON
        "--report",
        resolved_tool_contract.task.output_files[4],  # CSV
        "--min_seq_len",
        str(rtc.task.options[Constants.MIN_SEQ_LEN_ID]),
        "--cpus",
        str(resolved_tool_contract.task.nproc),
        "--outDir",
        op.dirname(rtc.task.output_files[0]),
        "--ignore-empty-output",
    ]
    if rtc.task.options[Constants.IGNORE_POLYA_ID]:
        args.append("--ignore_polyA")

    primers_str_obj = rtc.task.options[Constants.PRIMER_SEQUENCES_ID]
    primers_str = str(primers_str_obj).strip().translate(None, '\'\" ')
    if primers_str_obj is not None and primers_str not in ('None', ''):
        logging.info("Detected customer primer: %s", primers_str)
        # Save primer sequences to a fasta file under output dir
        primer_fasta_records = parse_primer_sequences(primers_str=primers_str)
        d = op.dirname(resolved_tool_contract.task.output_files[2])
        mkdir(d)
        primer_fn = op.join(d, "customer_primers.fasta")
        with FastaWriter(primer_fn) as writer:
            for record in primer_fasta_records:
                writer.writeRecord(record)
        logging.info("Customer primer sequences written to file %s", primer_fn)
        args.append("-p")
        args.append("%s" % primer_fn)
    else:
        logging.info("No customer primer detected.")

    return get_argument_parser().parse_args(args)
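The primer string is normalized by stripping quotes and whitespace before it is parsed; a spot check using the Python 2 str.translate signature seen above and a made-up primer sequence:

primers_str = str(' "ATGCATGCATGC" ').strip().translate(None, '\'\" ')
assert primers_str == 'ATGCATGCATGC'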
Example #28
    def __init__(self, prog_name, root_dir,
                 bas_fofn=None, ccs_fofn=None, fasta_fofn=None,
                 no_log_f=False, tmp_dir=None, make_dirs=True):
        """
        prog_name --- name of a sub-class
        root_dir --- root directory of the whole project. There will be
                     sub-directories under it, including:
                     tmp/ --- 0/  c0, c1, ..., c9999
                          --- 1/  c10000, c10001, ..., c19999
                          ...
                          each c? folder contains data for a cluster id=c?
                     script/
                          --- 0/  gcon_job_?.sh, gcon jobs in the first iteration
                          --- 1/  gcon_job_?.sh, gcon jobs in the second iteration
                          ...
                     log/
                          --- ICE.log   Log of the ICE algorithm
                          --- 0/  log for jobs in the first iteration
                          ...
                     output/   output files go here.
        bas_fofn --- input.fofn which contains movie.bas|bax.h5 files.
        ccs_fofn --- a fofn which contains movie.ccs.h5 files.
        fasta_fofn --- a fofn which contains movie.bax.h5.fasta files.
                     script/
        no_log_f --- DON'T write log to a log file.
        tmp_dir --- Write temporary files to tmp_dir (usually /scratch) for speed
        """
        self.prog_name = str(prog_name)
        self.root_dir = real_ppath(root_dir)
        self._tmp_dir = real_ppath(tmp_dir)

        self.bas_fofn = real_ppath(bas_fofn)
        self.ccs_fofn = real_ppath(ccs_fofn)
        self.fasta_fofn = real_ppath(fasta_fofn)

        if make_dirs is True:
            mkdir(self.root_dir)
            mkdir(self.tmp_dir)
            mkdir(self.log_dir)
            mkdir(self.script_dir)
            mkdir(self.out_dir)

        self.no_log_f = no_log_f
        if not no_log_f:
            self.log_f = open(self.log_fn, 'w', 0)
            self.add_log(msg="{p} initialized.".format(p=self.prog_name))
Example #29
 def setUp(self):
     """Initialize."""
     self.data_dir = op.join(DATA_DIR, "test_daligner_against_ref")
     self.script_dir = op.join(OUT_DIR, "test_ice_daligner_script")
     self.dazz_dir = op.join(OUT_DIR, "test_ice_daligner_dazz")
     self.out_dir = op.join(OUT_DIR, "test_ice_daligner_out")
     mkdir(self.dazz_dir)
     mkdir(self.out_dir)
     self.stdout_dir = STD_DIR
     self.sivDataDir = SIV_DATA_DIR
     self.query_filename  = "test_daligner_query.fasta"
     self.target_filename = "test_daligner_target.fasta"
     self.runner = DalignerRunner(query_filename=op.join(self.data_dir, self.query_filename),
                                  target_filename=op.join(self.data_dir, self.target_filename),
                                  is_FL=False, same_strand_only=True,
                                  dazz_dir=self.dazz_dir, script_dir=self.script_dir)
     self.runner.output_dir = self.out_dir
Example #30
    def __init__(self, root_dir, fasta_filenames, ref_fasta,
                 out_pickle, sge_opts, ccs_fofn=None, tmp_dir=None):
        """
        fasta_filenames --- a list of split nfl fasta files.

        ref_fasta --- (unpolished) consensus isoforms

        out_pickle --- a pickle file with all nfl fasta reads

        ccs_fofn --- should be reads_of_insert.fofn or None

        root_dir --- ICE root output directory

        tmp_dir --- if not None, write temporary clusters, dazz, las
                    files to the given temporary directory

        sge_opts --- params for SGE environment, including
            use_sge    : use SGE or not
            max_sge_jobs: maximum number of sub-jobs submitted
            unique_id  : unique qsub job id, important that this
                        DOES NOT CONFLICT!
            blasr_nproc: blasr -nproc param, number of threads per cpu.

        """
        self.prog_name = "IceAllPartials"
        IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir, tmp_dir=tmp_dir)

        self.fasta_filenames, self.ref_fasta, self.ccs_fofn, = \
            self._validate_inputs(fasta_filenames=fasta_filenames,
                                  ref_fasta=ref_fasta,
                                  ccs_fofn=ccs_fofn)

        self.out_pickle = out_pickle

        self.sge_opts = sge_opts

        self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
        mkdir(self.nfl_dir)

        self.add_log("input fasta files are: " +
                     ", ".join(self.fasta_filenames))
        self.add_log("temp pickle files are: " +
                     ", ".join(self.pickle_filenames))
        self.add_log("out pickle file is: " + self.out_pickle)
        self.add_log("temp directory is: " + str(self.tmp_dir))
Example #31
 def _cp(task,
         new_task,
         copied_files,
         copy_consensus_isoforms=copy_consensus_isoforms,
         copy_flnc_pickle=copy_flnc_pickle,
         copy_nfl_pickle=copy_nfl_pickle):
     """Copy task.files to new_task.files."""
     if copy_consensus_isoforms is True and new_task.consensus_isoforms_file not in copied_files:
         shutil.copy(task.consensus_isoforms_file,
                     new_task.consensus_isoforms_file)
         copied_files[new_task.consensus_isoforms_file] = True
     if copy_flnc_pickle is True and new_task.flnc_pickle not in copied_files:
         mkdir(op.dirname(new_task.flnc_pickle))
         shutil.copy(task.flnc_pickle, new_task.flnc_pickle)
         copied_files[new_task.flnc_pickle] = True
     if copy_nfl_pickle is True and new_task.nfl_pickle not in copied_files:
         mkdir(op.dirname(new_task.nfl_pickle))
         shutil.copy(task.nfl_pickle, new_task.nfl_pickle)
         copied_files[new_task.nfl_pickle] = True
Example #32
    def run(self):
        """ Call DalignerRunner """
        logging.info("Running {f} v{v}.".format(f=op.basename(__file__),
                                                v=self.getVersion()))
        args = self.args
        mkdir(args.output_dir)

        sge_opts = SgeOptions(unique_id=args.unique_id,
                              use_sge=args.use_sge,
                              max_sge_jobs=args.max_sge_jobs,
                              blasr_nproc=args.blasr_nproc,
                              sge_env_name=args.sge_env_name,
                              sge_queue=args.sge_queue)

        obj = DalignerRunner(query_filename=args.query_fasta,
                             target_filename=args.target_fasta,
                             is_FL=args.is_FL, same_strand_only=args.same_strand_only,
                             query_converted=False, target_converted=False,
                             use_sge=args.use_sge, sge_opts=sge_opts)
        obj.run(output_dir=args.output_dir)
Example #34
 def setUp(self):
     """Initialize."""
     self.data_dir = op.join(DATA_DIR, "test_daligner_against_ref")
     self.script_dir = op.join(OUT_DIR, "test_ice_daligner_script")
     self.dazz_dir = op.join(OUT_DIR, "test_ice_daligner_dazz")
     self.out_dir = op.join(OUT_DIR, "test_ice_daligner_out")
     mkdir(self.dazz_dir)
     mkdir(self.out_dir)
     self.stdout_dir = STD_DIR
     self.sivDataDir = SIV_DATA_DIR
     self.query_filename = "test_daligner_query.fasta"
     self.target_filename = "test_daligner_target.fasta"
     self.runner = DalignerRunner(
         query_filename=op.join(self.data_dir, self.query_filename),
         target_filename=op.join(self.data_dir, self.target_filename),
         is_FL=False,
         same_strand_only=True,
         dazz_dir=self.dazz_dir,
         script_dir=self.script_dir)
     self.runner.output_dir = self.out_dir
Example #35
    def validate_inputs(self):
        """Validate input fofns, and root_dir, log_dir, tmp_dir,
        create arrowed_dir and arrowed_log_dir"""
        self.add_log("Validating inputs.")

        # Create directories: root_dir/arrowed and root_dir/log_dir/arrowed
        try:
            mkdir(self.arrowed_dir)
            mkdir(self.arrowed_log_dir)
        except OSError:
            # Multiple ice_arrow_i jobs may run at the same time and try to
            # mkdir, race condition may happen, so ignore OSError here.
            pass

        errMsg = ""

        if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
            errMsg = "Log dir {l} is not an existing directory.".\
                format(l=self.log_dir)
        elif self.subread_xml is None:
            errMsg = "Please specify subreads XML (e.g., --subread_xml=<movie>.subreadset.xml)."
        elif not nfs_exists(self.subread_xml):
            errMsg = "Specified subreads file (subread_xml={f}) does not exist.".format(
                f=self.subread_xml)
        elif guess_file_format(self.subread_xml) is not FILE_FORMATS.BAM:
            errMsg = "Invalid subreads XML file: {0}!".format(self.subread_xml)
        elif not nfs_exists(self.nfl_all_pickle_fn):
            #"output/map_noFL/noFL.ALL.partial_uc.pickle"):
            errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                     "which assigns all non-full-length reads to isoforms " + \
                     "does not exist. Please check 'run_IcePartials2.py *' are " + \
                     "all done."
        elif not nfs_exists(self.final_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                     "which assigns full-length non-chimeric reads to " + \
                     "isoforms does not exist."

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
Example #36
    def __init__(self, flnc_filename, root_dir, out_pickle, output_basename):
        """
        Reads in input flnc file will be separated into multiple categories
        according to separation criterion, and reads in each category will
        be written into
            <root_dir>/<separation_criteria>/<output_basename>.fasta|contigset.xml

        e.g., if reads are separated by primers, then reads will be written to
        <root_dir>/<primer*>/<output_basename>.fasta|contigset.xml

        Parameters:
          flnc_filename - input full length non-chimeric reads in FASTA or CONTIGSET
          root_dir - output root directory
          output_basename - output file basename
        """
        self.flnc_filename = flnc_filename
        self.root_dir = realpath(root_dir)
        mkdir(root_dir)
        self.output_basename = output_basename
        self.create_contigset = flnc_filename.endswith(".xml")
        self.handles = {} # key --> fasta file handler
        self.out_pickle = out_pickle if out_pickle is not None \
                          else op.join(self.root_dir, "separate_flnc.pickle")
Example #38
    def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
        """
        Reconstruct ref_fa of the cluster in the new tmp_dir
        e.g.,
            self.g_consensus_ref_fa_of_cluster(cid)

        Liz: new cids after ice2 collection is b<bin>_c<cid>
        refs --- dict{int(cid): ref_fa of cluster(cid)}
        """
        # Check existence when first time it is read.
        if not nfs_exists(self.final_consensus_fa):
            raise IOError("Final consensus FASTA file {f}".format(
                f=self.final_consensus_fa) + "does not exist.")

        print("Reconstructing g consensus files for clusters {0}, {1} in {2}".
              format(cids[0], cids[-1], self.tmp_dir))
        self.add_log(
            "Reconstructing g consensus files for clusters {0}, {1} in {2}".
            format(cids[0], cids[-1], self.tmp_dir))

        final_consensus_d = FastaRandomReader(self.final_consensus_fa)
        for ref_id in list(final_consensus_d.d.keys()):
            # Liz: this is no longer valid for the Ice2 cids #cid = int(ref_id.split('/')[0].replace('c', ''))
            cid = ref_id
            if cid in cids:
                _dir = self.cluster_dir_for_reconstructed_ref(cid)
                mkdir(_dir)
                ref_fa = op.join(_dir, op.basename(refs[cid]))
                refs[cid] = ref_fa
                with FastaWriter(ref_fa) as writer:
                    self.add_log("Writing ref_fa %s" % refs[cid])
                    writer.writeRecord(ref_id,
                                       final_consensus_d[ref_id].sequence[:])

        self.add_log("Reconstruct of g consensus files completed.",
                     level=logging.INFO)
Example #39
    def run(self, output_dir='.', min_match_len=300, sensitive_mode=False):
        """
        if self.use_sge --- writes to <scripts>/daligner_job_#.sh
        else --- run locally, dividing into self.cpus/4 tasks (capped max at 4)

        NOTE 1: when using SGE, be careful that multiple calls to this might
        end up writing to the SAME job.sh files, this should be avoided by
        changing <scripts> directory

        NOTE 2: more commonly this should be invoked locally
        (since ice_partial.py i/one will be qsub-ed);
        in that case it is recommended to keep self.cpus = 4 so that
        each daligner job runs consecutively and the original qsub job
        should have been called with qsub -pe smp 4 (set by --blasr_nproc 4).
        In this way, the daligner jobs are called consecutively, but LA4Ice
        is parallelized 4X.
        """
        self.output_dir = realpath(output_dir) # Reset output_dir
        old_dir = realpath(op.curdir)
        mkdir(output_dir)
        os.chdir(output_dir)

        if self.use_sge:
            mknewdir(self.script_dir)

        # prepare done scripts is no longer necessary.
        #self.write_daligner_done_script()
        #self.write_la4ice_done_script()

        # (a) run all daligner jobs
        daligner_cmds = self.daligner_cmds(min_match_len=min_match_len,
                                           sensitive_mode=sensitive_mode)

        logging.info("Start daligner cmds " +
                     ("using sge." if self.use_sge else "locally."))
        logging.debug("CMD: " + "\n".join(daligner_cmds))

        start_t = time.time()
        failed = []
        if self.use_sge:
            failed.extend(
                sge_job_runner(cmds_list=daligner_cmds,
                               script_files=self.daligner_scripts,
                               #done_script=self.daligner_done_script,
                               num_threads_per_job=DALIGNER_NUM_THREADS,
                               sge_opts=self.sge_opts, qsub_try_times=3,
                               wait_timeout=600, run_timeout=600,
                               rescue="sge", rescue_times=3))
        else:
            # max 4 at a time to avoid running out of memory...
            failed.extend(
                local_job_runner(cmds_list=daligner_cmds,
                                 num_threads=max(1, min(self.cpus/4, 4))))
        logging.info("daligner jobs took " + str(time.time()-start_t) + " sec.")

        # (b) run all LA4Ice jobs
        start_t = time.time()
        logging.info("Start LA4Ice cmds " +
                     ("using sge." if self.use_sge else "locally."))
        la4ice_cmds = self.la4ice_cmds
        logging.debug("CMD: " + "\n".join(la4ice_cmds))

        if self.use_sge:
            failed.extend(
                sge_job_runner(cmds_list=la4ice_cmds,
                               script_files=self.la4ice_scripts,
                               #done_script=self.la4ice_done_script,
                               num_threads_per_job=DALIGNER_NUM_THREADS,
                               sge_opts=self.sge_opts, qsub_try_times=3,
                               wait_timeout=600, run_timeout=600,
                               rescue="sge", rescue_times=3))
        else:
            # max 4 at a time to avoid running out of memory...
            failed.extend(
                local_job_runner(cmds_list=la4ice_cmds,
                                 num_threads=max(1, min(self.cpus, 4))))
        logging.info("LA4Ice jobs took " + str(time.time()-start_t) + " sec.")
        os.chdir(old_dir)

        if len(failed) == 0:
            return 0
        else:
            raise RuntimeError("%s.run failed, %s." %
                               (op.basename(self.__class__),
                                "\n".join([x[0] for x in failed])))
Example #40
def args_runner(args):
    """args runner"""
    logging.info("%s arguments are:\n%s\n", __file__, args)

    # sanity check arguments
    _sanity_check_args(args)

    # make option objects
    ice_opts = IceOptions(quiver=args.quiver, use_finer_qv=args.use_finer_qv,
                          targeted_isoseq=args.targeted_isoseq,
                          ece_penalty=args.ece_penalty, ece_min_len=args.ece_min_len,
                          nfl_reads_per_split=args.nfl_reads_per_split)
    sge_opts = SgeOptions(unique_id=args.unique_id, use_sge=args.use_sge,
                          max_sge_jobs=args.max_sge_jobs, blasr_nproc=args.blasr_nproc,
                          quiver_nproc=args.quiver_nproc, gcon_nproc=args.gcon_nproc,
                          sge_env_name=args.sge_env_name, sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5, qv_trim_3=args.qv_trim_3,
                                    hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)

    # (1) separate flnc reads into bins
    logging.info("Separating FLNC reads into bins.")
    tofu_f = TofuFiles(tofu_dir=args.tofu_dir)
    s = SeparateFLNCRunner(flnc_fa=args.flnc_fa, root_dir=args.tofu_dir,
                           out_pickle=tofu_f.separate_flnc_pickle,
                           bin_size_kb=args.bin_size_kb, bin_by_primer=args.bin_by_primer,
                           bin_manual=args.bin_manual, max_base_limit_MB=args.max_base_limit_MB)
    s.run()

    flnc_files = SeparateFLNCBase.convert_pickle_to_sorted_flnc_files(tofu_f.separate_flnc_pickle)
    logging.info("Separated FLNC reads bins are %s", flnc_files)

    # (2) apply 'pbtranscript cluster' to each bin
    # run ICE/Quiver (the whole thing), providing the fasta_fofn
    logging.info("Running ICE/Polish on separated FLNC reads bins.")
    split_dirs = []
    for flnc_file in flnc_files:
        split_dir = op.join(realpath(op.dirname(flnc_file)), "cluster_out")
        mkdir(split_dir)
        split_dirs.append(split_dir)
        cur_out_cons = op.join(split_dir, "consensus_isoforms.fasta")

        ipq_f = IceQuiverPostprocess(root_dir=split_dir, ipq_opts=ipq_opts)
        if op.exists(ipq_f.quivered_good_fq):
            logging.warning("HQ polished isoforms %s already exist. SKIP!", ipq_f.quivered_good_fq)
            continue
        else:
            logging.info("Running ICE/Quiver on %s", split_dir)
            rmpath(cur_out_cons)

        obj = Cluster(root_dir=split_dir, flnc_fa=flnc_file,
                      nfl_fa=args.nfl_fa,
                      bas_fofn=args.bas_fofn,
                      ccs_fofn=args.ccs_fofn,
                      fasta_fofn=args.fasta_fofn,
                      out_fa=cur_out_cons, sge_opts=sge_opts,
                      ice_opts=ice_opts, ipq_opts=ipq_opts)

        if args.mem_debug: # DEBUG
            from memory_profiler import memory_usage
            start_t = time.time()
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(split_dir,
                                                                            end_t-start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

        if not args.keep_tmp_files: # by default, delete all temporary files.
            logging.info("Deleting %s", ipq_f.tmp_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.tmp_dir])
            logging.info("Deleting %s", ipq_f.quivered_dir)
            subprocess.Popen(['rm', '-rf', '%s' % ipq_f.quivered_dir])

    # (3) merge polished isoform cluster from all bins
    logging.info("Merging isoforms from all bins to %s.", tofu_f.combined_dir)
    c = CombineRunner(combined_dir=tofu_f.combined_dir,
                      sample_name=get_sample_name(args.sample_name),
                      split_dirs=split_dirs, ipq_opts=ipq_opts)
    c.run()
    if args.summary_fn is not None:
        ln(tofu_f.all_cluster_summary_fn, args.summary_fn)
    if args.report_fn is not None:
        ln(tofu_f.all_cluster_report_fn, args.report_fn)

    # (4) map HQ isoforms to GMAP reference genome
    map_isoforms_and_sort(input_filename=tofu_f.all_hq_fq, sam_filename=tofu_f.sorted_gmap_sam,
                          gmap_db_dir=args.gmap_db, gmap_db_name=args.gmap_name,
                          gmap_nproc=args.gmap_nproc)

    # (5) post mapping to genome analysis, including
    #     * collapse polished HQ isoform clusters into groups
    #     * count abundance of collapsed isoform groups
    #     * filter collapsed isoforms based on abundance info
    logging.info("Post mapping to genome analysis.")
    out_isoforms = args.collapsed_filtered_fn
    if any(out_isoforms.endswith(ext) for ext in (".fa", ".fasta")):
        in_isoforms = tofu_f.all_hq_fa
    elif any(out_isoforms.endswith(ext) for ext in (".fq", ".fastq")):
        in_isoforms = tofu_f.all_hq_fq
    else:
        raise ValueError("Output file %s must be FASTA or FASTQ!" % out_isoforms)

    post_mapping_to_genome_runner(
        in_isoforms=in_isoforms, in_sam=tofu_f.sorted_gmap_sam,
        in_pickle=tofu_f.hq_lq_prefix_dict_pickle, out_isoforms=args.collapsed_filtered_fn,
        out_gff=args.gff_fn, out_abundance=args.abundance_fn,
        out_group=args.group_fn, out_read_stat=args.read_stat_fn,
        min_aln_coverage=args.min_aln_coverage, min_aln_identity=args.min_aln_identity,
        min_flnc_coverage=args.min_flnc_coverage, max_fuzzy_junction=args.max_fuzzy_junction,
        allow_extra_5exon=args.allow_extra_5exon, min_count=args.min_count)

    return 0
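
For reference, the --mem_debug branch above simply wraps the long-running step with memory_profiler.memory_usage plus wall-clock timing. Below is a minimal standalone sketch of that pattern, assuming memory_profiler is installed; heavy_step() is a hypothetical stand-in for Cluster.run, and memory_usage returns a list of memory samples in MiB, so max() gives the peak.

import time
from memory_profiler import memory_usage

def heavy_step():
    """Hypothetical stand-in for a long-running step such as Cluster.run."""
    data = [list(range(1000)) for _ in range(1000)]
    return len(data)

start_t = time.time()
# Sample resident memory once per second while heavy_step() runs.
mem_usage = memory_usage(heavy_step, interval=1)
end_t = time.time()

with open('mem_debug.log', 'a') as f:
    f.write("heavy_step took {0:.1f} secs.\n".format(end_t - start_t))
    f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))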
Example #41
0
 def __init__(self, combined_dir):
     self.combined_dir = realpath(combined_dir)
     mkdir(self.combined_dir)
Example #42
0
    def run(self, output_dir='.', min_match_len=300, sensitive_mode=False):
        """
        if self.use_sge --- writes to <scripts>/daligner_job_#.sh
        else --- run locally, dividing the work into self.cpus/4 tasks (capped at 4)

        NOTE 1: when using SGE, be careful that multiple calls to this method
        may end up writing to the SAME job.sh files; avoid this by changing
        the <scripts> directory between calls.

        NOTE 2: more commonly this is invoked locally (since the
        ice_partial.py job itself would be qsub-ed); in that case keep
        self.cpus = 4 so that daligner jobs run one after another, and the
        original qsub job should have been submitted with qsub -pe smp 4
        (set by --blasr_nproc 4). This way the daligner jobs run
        consecutively while each LA4Ice call is parallelized 4X.
        """
        self.output_dir = realpath(output_dir) # Reset output_dir
        old_dir = realpath(op.curdir)
        mkdir(output_dir)
        os.chdir(output_dir)

        if self.use_sge:
            mknewdir(self.script_dir)

        # Preparing 'done' scripts is no longer necessary.
        #self.write_daligner_done_script()
        #self.write_la4ice_done_script()

        # (a) run all daligner jobs
        daligner_cmds = self.daligner_cmds(min_match_len=min_match_len,
                                           sensitive_mode=sensitive_mode)

        logging.info("Start daligner cmds " +
                     ("using sge." if self.use_sge else "locally."))
        logging.debug("CMD: " + "\n".join(daligner_cmds))

        start_t = time.time()
        failed = []
        if self.use_sge:
            failed.extend(
                sge_job_runner(cmds_list=daligner_cmds,
                               script_files=self.daligner_scripts,
                               #done_script=self.daligner_done_script,
                               num_threads_per_job=DALIGNER_NUM_THREADS,
                               sge_opts=self.sge_opts, qsub_try_times=3,
                               wait_timeout=600, run_timeout=600,
                               rescue="sge", rescue_times=3))
        else:
            # max 4 at a time to avoid running out of memory...
            failed.extend(
                local_job_runner(cmds_list=daligner_cmds,
                                 num_threads=max(1, min(self.cpus/4, 4))))
        logging.info("daligner jobs took " + str(time.time()-start_t) + " sec.")

        # (b) run all LA4Ice jobs
        start_t = time.time()
        logging.info("Start LA4Ice cmds " +
                     ("using sge." if self.use_sge else "locally."))
        la4ice_cmds = self.la4ice_cmds
        logging.debug("CMD: " + "\n".join(la4ice_cmds))

        if self.use_sge:
            failed.extend(
                sge_job_runner(cmds_list=la4ice_cmds,
                               script_files=self.la4ice_scripts,
                               #done_script=self.la4ice_done_script,
                               num_threads_per_job=DALIGNER_NUM_THREADS,
                               sge_opts=self.sge_opts, qsub_try_times=3,
                               wait_timeout=600, run_timeout=600,
                               rescue="sge", rescue_times=3))
        else:
            # max 4 at a time to avoid running out of memory...
            failed.extend(
                local_job_runner(cmds_list=la4ice_cmds,
                                 num_threads=max(1, min(self.cpus, 4))))
        logging.info("LA4Ice jobs took " + str(time.time()-start_t) + " sec.")
        os.chdir(old_dir)

        if len(failed) == 0:
            return 0
        else:
            raise RuntimeError("%s.run failed, %s." %
                               (self.__class__.__name__,
                                "\n".join([x[0] for x in failed])))
Example #43
0
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(
        qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
        qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
        hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(
        input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(
        ".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])),
                           "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(
        split_indices=cluster_bin_indices,
        split_hq_fns=hq_fq_fns,
        split_lq_fns=lq_fq_fns,
        combined_hq_fa=combined_files.all_hq_fa,
        combined_hq_fq=combined_files.all_hq_fq,
        combined_lq_fa=combined_files.all_lq_fa,
        combined_lq_fq=combined_files.all_lq_fq,
        hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
        sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa)  #'HQ isoforms'
    ln(combined_files.all_hq_fq, out_hq_fq)  #'HQ isoforms'
    ln(combined_files.all_lq_fa, out_lq_fa)  #'LQ isoforms'
    ln(combined_files.all_lq_fq, out_lq_fq)  #'LQ isoforms'
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(
        split_indices=cluster_bin_indices,
        split_files=split_consensus_isoforms,
        combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
        sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)
    #consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s",
             combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary)  # "cluster summary"

    log.info("Writing cluster report to %s",
             combined_files.all_cluster_report_fn)
    write_combined_cluster_report(
        split_indices=cluster_bin_indices,
        split_uc_pickles=split_uc_pickles,
        split_partial_uc_pickles=split_partial_uc_pickles,
        report_fn=combined_files.all_cluster_report_fn,
        sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report)  # "cluster report"
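
A convention used throughout the runner above is that every *.contigset.xml output is backed by a sibling *.fasta file, derived by a suffix swap and then registered with as_contigset. A tiny sketch of that naming rule follows; the helper name and example path are ours, not pbtranscript's.

import os.path as op

def fasta_for_contigset(contigset_xml):
    """Return the sibling FASTA path for a *.contigset.xml output."""
    assert contigset_xml.endswith(".contigset.xml")
    return contigset_xml.replace(".contigset.xml", ".fasta")

# e.g. "combined/hq_isoforms.contigset.xml" -> "combined/hq_isoforms.fasta"
hq_fa = fasta_for_contigset(op.join("combined", "hq_isoforms.contigset.xml"))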
Example #44
0
def make_pickle(in_pickle, out_pickle, root_dir,
                copy_consensus_isoforms=False,
                copy_flnc_pickle=False,
                copy_nfl_pickle=False,
                copy_quivered=False):
    """
    Copy cluster_out_dir in in_pickle to {root_dir}/bin_name/cluster_out/
    """
    mkdir(root_dir)
    def make_flnc(in_flnc, root_dir):
        bin_name = op.basename(op.dirname(in_flnc))
        flnc_name = op.basename(in_flnc)

        assert in_flnc.endswith(".contigset.xml")
        in_flnc_fa = in_flnc.replace(".contigset.xml", ".fasta")
        new_flnc = op.join(root_dir, bin_name, flnc_name)
        new_flnc_fa = new_flnc.replace(".contigset.xml", ".fasta")

        print "new_flnc = %s" % new_flnc
        shutil.copy(in_flnc_fa, new_flnc_fa)
        as_contigset(new_flnc_fa, new_flnc)

    def make_cluster_out_dir(in_dir, root_dir):
        bin_name = op.basename(op.dirname(in_dir))
        new_dir = op.join(root_dir, bin_name, "cluster_out") #e.g., root_dir/0to1kb_part0/cluster_out
        mkdir(new_dir)
        return new_dir

    def _cp(task, new_task, copied_files, copy_consensus_isoforms=copy_consensus_isoforms,
            copy_flnc_pickle=copy_flnc_pickle, copy_nfl_pickle=copy_nfl_pickle):
        """Copy task.files to new_task.files."""
        if copy_consensus_isoforms is True and new_task.consensus_isoforms_file not in copied_files:
            shutil.copy(task.consensus_isoforms_file, new_task.consensus_isoforms_file)
            copied_files[new_task.consensus_isoforms_file] = True
        if copy_flnc_pickle is True and new_task.flnc_pickle not in copied_files:
            mkdir(op.dirname(new_task.flnc_pickle))
            shutil.copy(task.flnc_pickle, new_task.flnc_pickle)
            copied_files[new_task.flnc_pickle] = True
        if copy_nfl_pickle is True and new_task.nfl_pickle not in copied_files:
            mkdir(op.dirname(new_task.nfl_pickle))
            shutil.copy(task.nfl_pickle, new_task.nfl_pickle)
            copied_files[new_task.nfl_pickle] = True


    print "making pickle from in_pickle %s to out_pickle %s, root_dir %s" % \
            (in_pickle, out_pickle, root_dir)

    p = ChunkTasksPickle.read(in_pickle)
    assert len(p) > 0
    if all([isinstance(task, ClusterChunkTask) for task in p]):
        outp = ChunkTasksPickle()
        copied_files = dict()
        for task in p:
            cluster_out_dir = make_cluster_out_dir(task.cluster_out_dir, root_dir)
            print "new_cluster_out_dir is %s" % cluster_out_dir
            #flnc_file = make_flnc(task.flnc_file)
            new_task = ClusterChunkTask(task.cluster_bin_index,
                                        task.flnc_file,
                                        cluster_out_dir)
            _cp(task=task, new_task=new_task, copied_files=copied_files,
                copy_consensus_isoforms=copy_consensus_isoforms,
                copy_flnc_pickle=copy_flnc_pickle, copy_nfl_pickle=copy_nfl_pickle)
            outp.append(new_task)
        outp.write(out_pickle)
    elif all([isinstance(task, PartialChunkTask) for task in p]):
        outp = ChunkTasksPickle()
        copied_files = dict()
        for task in p:
            cluster_out_dir = make_cluster_out_dir(task.cluster_out_dir, root_dir)
            print "new_cluster_out_dir is %s" % cluster_out_dir
            #flnc_file = make_flnc(task.flnc_file)
            new_task = PartialChunkTask(task.cluster_bin_index,
                                        task.flnc_file,
                                        cluster_out_dir,
                                        nfl_file=task.nfl_file,
                                        nfl_index=task.nfl_index,
                                        n_nfl_chunks=task.n_nfl_chunks)
            _cp(task=task, new_task=new_task, copied_files=copied_files,
                copy_consensus_isoforms=copy_consensus_isoforms,
                copy_flnc_pickle=copy_flnc_pickle, copy_nfl_pickle=copy_nfl_pickle)
            outp.append(new_task)
        outp.write(out_pickle)
    elif all([isinstance(task, PolishChunkTask) for task in p]):
        outp = ChunkTasksPickle()
        copied_files = dict()
        for task in p:
            cluster_out_dir = make_cluster_out_dir(task.cluster_out_dir, root_dir)
            print "new_cluster_out_dir is %s" % cluster_out_dir
            #flnc_file = make_flnc(task.flnc_file)
            new_task = PolishChunkTask(task.cluster_bin_index,
                                       task.flnc_file,
                                       cluster_out_dir,
                                       polish_index=task.polish_index,
                                       n_polish_chunks=task.n_polish_chunks)
            mkdir(op.dirname(new_task.nfl_pickle))
            # always copy nfl_pickle for PolishChunkTask
            assert copy_nfl_pickle is True

            _cp(task=task, new_task=new_task, copied_files=copied_files,
                copy_consensus_isoforms=copy_consensus_isoforms,
                copy_flnc_pickle=copy_flnc_pickle, copy_nfl_pickle=copy_nfl_pickle)
            dst_dir = op.join(cluster_out_dir, "quivered")
            if copy_quivered is True and dst_dir not in copied_files:
                if op.exists(dst_dir):
                    shutil.rmtree(dst_dir)
                shutil.copytree(op.join(task.cluster_out_dir, "quivered"), dst_dir)
                copied_files[dst_dir] = True
            outp.append(new_task)
        outp.write(out_pickle)
    else:
        assert False
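
The _cp() helper above keeps a copied_files dict so that files shared by several chunk tasks are copied only once per make_pickle call. A stripped-down sketch of that copy-once bookkeeping; the function name and the commented paths are illustrative only.

import shutil

def copy_once(src, dst, copied_files):
    """Copy src to dst unless dst was already copied during this run."""
    if dst not in copied_files:
        shutil.copy(src, dst)
        copied_files[dst] = True

# copied = dict()
# copy_once("bin0/cluster_out/final.pickle", "new_root/bin0/cluster_out/final.pickle", copied)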
Example #45
0
 def make_cluster_out_dir(in_dir, root_dir):
     bin_name = op.basename(op.dirname(in_dir))
     new_dir = op.join(root_dir, bin_name, "cluster_out") #e.g., root_dir/0to1kb_part0/cluster_out
     mkdir(new_dir)
     return new_dir
"""Test pbtranscript.collapsing.Branch."""
import unittest
import os.path as op
import cPickle
import filecmp
import numpy as np
from pbtranscript.Utils import rmpath, mkdir
from pbtranscript.tasks.map_isoforms_to_genome import gmap_db_and_name_from_ds
from test_setpath import DATA_DIR, OUT_DIR, SIV_DATA_DIR, SIV_STD_DIR


READS_DS = op.join(SIV_DATA_DIR, 'test_collapsing', 'gmap-input.fastq.contigset.xml')
GMAP_DS = op.join(SIV_DATA_DIR, "gmap-referenceset-root-dir/SIRV/gmapreferenceset.xml")
_OUT_DIR_ = op.join(OUT_DIR, "test_map_isoforms_to_genome")
rmpath(_OUT_DIR_)
mkdir(_OUT_DIR_)


class TEST_map_isoforms_to_genome(unittest.TestCase):
    """Test functions of pbtranscript.tasks.map_isoforms_to_genome."""
    def setUp(self):
        """Define input and output file."""

    def test_gmap_db_and_name_from_ds(self):
        """Test map_isoforms_to_genome.gmap_db_and_name_from_ds"""
        gmap_db, gmap_name = gmap_db_and_name_from_ds(GMAP_DS)
        self.assertEqual(gmap_db, op.join(SIV_DATA_DIR, "gmap-referenceset-root-dir", "SIRV"))
        self.assertEqual(gmap_name, "gmap_db")

Example #47
0
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, create summary.json, cluster_report.csv,
    hq_isoforms.fa|fq, lq_isoforms.fa|fq
    Finally, merge all cluster bins and save all outputs to 'combined'.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    p.sorted_by_attr(attr='cluster_bin_index')

    opts = rtc.task.options
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=opts[Constants.QV_TRIM_FIVEPRIME_ID],
                                    qv_trim_3=opts[Constants.QV_TRIM_THREEPRIME_ID],
                                    hq_quiver_min_accuracy=opts[Constants.HQ_QUIVER_MIN_ACCURACY_ID])
    sample_name = get_sample_name(input_sample_name=opts[Constants.SAMPLE_NAME_ID])

    out_consensus_isoforms_cs = rtc.task.output_files[0]
    out_summary = rtc.task.output_files[1]
    out_report = rtc.task.output_files[2]
    out_hq_cs = rtc.task.output_files[3]
    out_hq_fq = rtc.task.output_files[4]
    out_lq_cs = rtc.task.output_files[5]
    out_lq_fq = rtc.task.output_files[6]
    out_hq_lq_prefix_dict_pickle = rtc.task.output_files[7]

    assert out_consensus_isoforms_cs.endswith(".contigset.xml")
    assert out_hq_cs.endswith(".contigset.xml")
    assert out_lq_cs.endswith(".contigset.xml")
    out_consensus_isoforms_fa = out_consensus_isoforms_cs.replace(".contigset.xml", ".fasta")
    out_hq_fa = out_hq_cs.replace('.contigset.xml', '.fasta')
    out_lq_fa = out_lq_cs.replace('.contigset.xml', '.fasta')

    hq_fq_fns, lq_fq_fns = [], []
    split_uc_pickles, split_partial_uc_pickles = [], []
    split_consensus_isoforms = []

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    cluster_out_dirs = [task.cluster_out_dir for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    for task in p:
        ice_pq = IceQuiverPostprocess(root_dir=task.cluster_out_dir,
                                      ipq_opts=ipq_opts)
        hq_fq_fns.append(ice_pq.quivered_good_fq)
        lq_fq_fns.append(ice_pq.quivered_bad_fq)
        split_uc_pickles.append(ice_pq.final_pickle_fn)
        split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
        split_consensus_isoforms.append(ice_pq.final_consensus_fa)

    combined_dir = op.join(op.dirname(op.dirname(cluster_out_dirs[0])), "combined")
    mkdir(combined_dir)
    combined_files = CombinedFiles(combined_dir)
    log.info("Combining results of all cluster bins to %s.", combined_dir)
    log.info("Merging HQ|LQ isoforms from all cluster bins.")
    log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
    log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
    combine_polished_isoforms(split_indices=cluster_bin_indices,
                              split_hq_fns=hq_fq_fns,
                              split_lq_fns=lq_fq_fns,
                              combined_hq_fa=combined_files.all_hq_fa,
                              combined_hq_fq=combined_files.all_hq_fq,
                              combined_lq_fa=combined_files.all_lq_fa,
                              combined_lq_fq=combined_files.all_lq_fq,
                              hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
                              sample_name=sample_name)

    ln(combined_files.all_hq_fa, out_hq_fa) #'HQ isoforms'
    ln(combined_files.all_hq_fq, out_hq_fq) #'HQ isoforms'
    ln(combined_files.all_lq_fa, out_lq_fa) #'LQ isoforms'
    ln(combined_files.all_lq_fq, out_lq_fq) #'LQ isoforms'
    ln(combined_files.hq_lq_prefix_dict_pickle, out_hq_lq_prefix_dict_pickle)

    as_contigset(out_hq_fa, out_hq_cs)
    as_contigset(out_lq_fa, out_lq_cs)

    log.info("Merging consensus isoforms from all cluster bins.")
    combine_consensus_isoforms(split_indices=cluster_bin_indices,
                               split_files=split_consensus_isoforms,
                               combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
                               sample_name=sample_name)
    ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)
    #consensus isoforms
    as_contigset(out_consensus_isoforms_fa, out_consensus_isoforms_cs)

    log.info("Writing cluster summary to %s", combined_files.all_cluster_summary_fn)
    write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                          isoforms_fa=out_consensus_isoforms_cs,
                          hq_fa=out_hq_fa,
                          lq_fa=out_lq_fa)
    ln(combined_files.all_cluster_summary_fn, out_summary) # "cluster summary"

    log.info("Writing cluster report to %s", combined_files.all_cluster_report_fn)
    write_combined_cluster_report(split_indices=cluster_bin_indices,
                                  split_uc_pickles=split_uc_pickles,
                                  split_partial_uc_pickles=split_partial_uc_pickles,
                                  report_fn=combined_files.all_cluster_report_fn,
                                  sample_name=sample_name)
    ln(combined_files.all_cluster_report_fn, out_report) # "cluster report"
Example #48
0
 def setUp(self):
     """Define input and output file."""
     rmpath(_OUT_DIR_)
     mkdir(_OUT_DIR_)
     self.gmap_db_dir = op.join(_OUT_DIR_, 'gmap db dir')
     os.symlink(GMAP_DB, self.gmap_db_dir)
Example #49
0
 def setUp(self):
     """Define input and output file."""
     rmpath(_OUT_DIR_)
     mkdir(_OUT_DIR_)
Example #50
0
    def run(self):
        """
        For each cluster bin, create summary.json, cluster_report.csv,
        hq_isoforms.fa|fq, lq_isoforms.fa|fq
        Finally, merge all cluster bins and save all outputs to 'combined'.
        """
        logging.info("Running {f} v{v}.".format(f=op.basename(__file__),
                                                v=self.getVersion()))
        args = self.args

        # Get cluster bins directories as input
        cluster_bin_dirs = self.get_cluster_bin_dirs(separate_flnc_pickle=args.separate_flnc_pickle,
                                                     cluster_bin_dirs=args.cluster_bin_dirs)
        cluster_bin_indices = range(0, len(cluster_bin_dirs))

        # Create output dir
        combined_dir = args.combined_dir
        mkdir(combined_dir)

        # Get combined output filenames
        def f(input_fn, default_fn):
            """Return input_fn if given, else a default file under combined_dir."""
            if input_fn is None:
                return op.join(combined_dir, default_fn)
            return input_fn

        out_consensus_isoforms_fa = f(args.consensus_isoforms_fa, "all.consensus_isoforms.fasta")
        out_summary = f(args.summary_fn, "all.cluster_summary.json")
        out_report = f(args.report_fn, "all.cluster_report.csv")
        out_hq_fa = f(args.hq_isoforms_fa, "all.polished_hq.fasta")
        out_lq_fa = f(args.lq_isoforms_fa, "all.polished_lq.fasta")
        out_hq_fq = f(args.hq_isoforms_fq, "all.polished_hq.fastq")
        out_lq_fq = f(args.lq_isoforms_fq, "all.polished_lq.fastq")

        ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5,
                                        qv_trim_3=args.qv_trim_3,
                                        hq_quiver_min_accuracy=args.hq_quiver_min_accuracy)
        sample_name = get_sample_name(input_sample_name=args.sample_name)


        hq_fq_fns, lq_fq_fns = [], []
        split_uc_pickles, split_partial_uc_pickles = [], []
        split_consensus_isoforms = []

        for cluster_bin_dir in cluster_bin_dirs:
            ice_pq = IceQuiverPostprocess(root_dir=cluster_bin_dir, ipq_opts=ipq_opts)
            hq_fq_fns.append(ice_pq.quivered_good_fq)
            lq_fq_fns.append(ice_pq.quivered_bad_fq)
            split_uc_pickles.append(ice_pq.final_pickle_fn)
            split_partial_uc_pickles.append(ice_pq.nfl_all_pickle_fn)
            split_consensus_isoforms.append(ice_pq.final_consensus_fa)

        combined_files = CombinedFiles(combined_dir)
        log.info("Combining results of all cluster bins to %s.", combined_dir)
        log.info("Merging HQ|LQ isoforms from all cluster bins.")
        log.info("HQ isoforms are: %s.", ",".join(hq_fq_fns))
        log.info("LQ isoforms are: %s.", ",".join(lq_fq_fns))
        combine_polished_isoforms(split_indices=cluster_bin_indices,
                                  split_hq_fns=hq_fq_fns,
                                  split_lq_fns=lq_fq_fns,
                                  combined_hq_fa=combined_files.all_hq_fa,
                                  combined_hq_fq=combined_files.all_hq_fq,
                                  combined_lq_fa=combined_files.all_lq_fa,
                                  combined_lq_fq=combined_files.all_lq_fq,
                                  hq_lq_prefix_dict_pickle=combined_files.hq_lq_prefix_dict_pickle,
                                  sample_name=sample_name)

        ln(combined_files.all_hq_fa, out_hq_fa) #'HQ isoforms'
        ln(combined_files.all_hq_fq, out_hq_fq) #'HQ isoforms'
        ln(combined_files.all_lq_fa, out_lq_fa) #'LQ isoforms'
        ln(combined_files.all_lq_fq, out_lq_fq) #'LQ isoforms'

        log.info("Merging consensus isoforms from all cluster bins.")
        combine_consensus_isoforms(split_indices=cluster_bin_indices,
                                   split_files=split_consensus_isoforms,
                                   combined_consensus_isoforms_fa=combined_files.all_consensus_isoforms_fa,
                                   sample_name=sample_name)
        ln(combined_files.all_consensus_isoforms_fa, out_consensus_isoforms_fa)

        log.info("Writing cluster summary to %s", combined_files.all_cluster_summary_fn)
        write_cluster_summary(summary_fn=combined_files.all_cluster_summary_fn,
                              isoforms_fa=out_consensus_isoforms_fa,
                              hq_fa=out_hq_fa, lq_fa=out_lq_fa)
        ln(combined_files.all_cluster_summary_fn, out_summary) # "cluster summary"

        log.info("Writing cluster report to %s", combined_files.all_cluster_report_fn)
        write_combined_cluster_report(split_indices=cluster_bin_indices,
                                      split_uc_pickles=split_uc_pickles,
                                      split_partial_uc_pickles=split_partial_uc_pickles,
                                      report_fn=combined_files.all_cluster_report_fn,
                                      sample_name=sample_name)
        ln(combined_files.all_cluster_report_fn, out_report) # "cluster report"
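
The helper f() near the top of run() applies a default-or-override rule to every combined output path: honor the caller's filename when given, otherwise fall back to a default file under combined_dir. A standalone sketch of that rule, using an illustrative helper name.

import os.path as op

def default_under(combined_dir, input_fn, default_fn):
    """Return input_fn if provided, else combined_dir/default_fn."""
    return input_fn if input_fn is not None else op.join(combined_dir, default_fn)

out_summary = default_under("combined", None, "all.cluster_summary.json")
# -> "combined/all.cluster_summary.json"
out_report = default_under("combined", "my_report.csv", "all.cluster_report.csv")
# -> "my_report.csv"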
Example #51
0
 def setUp(self):
     """Define input and output file."""
     rmpath(_OUT_DIR_)
     mkdir(_OUT_DIR_)
Example #52
0
 def __init__(self, combined_dir):
     self.combined_dir = realpath(combined_dir)
     mkdir(self.combined_dir)
Example #53
0
 def cluster_dir(self, cid):
     """"overwrite IceQuiver.cluster_dir"""
     dir_name = IceQuiver.cluster_dir(self, cid)
     mkdir(dir_name)
     return dir_name
Example #54
0
def make_pickle(in_pickle,
                out_pickle,
                root_dir,
                copy_consensus_isoforms=False,
                copy_flnc_pickle=False,
                copy_nfl_pickle=False,
                copy_quivered=False):
    """
    Copy cluster_out_dir in in_pickle to {root_dir}/bin_name/cluster_out/
    """
    mkdir(root_dir)

    def make_flnc(in_flnc, root_dir):
        bin_name = op.basename(op.dirname(in_flnc))
        flnc_name = op.basename(in_flnc)

        assert in_flnc.endswith(".contigset.xml")
        in_flnc_fa = in_flnc.replace(".contigset.xml", ".fasta")
        new_flnc = op.join(root_dir, bin_name, flnc_name)
        new_flnc_fa = new_flnc.replace(".contigset.xml", ".fasta")

        print "new_flnc = %s" % new_flnc
        shutil.copy(in_flnc_fa, new_flnc_fa)
        as_contigset(new_flnc_fa, new_flnc)

    def make_cluster_out_dir(in_dir, root_dir):
        bin_name = op.basename(op.dirname(in_dir))
        new_dir = op.join(
            root_dir, bin_name,
            "cluster_out")  #e.g., root_dir/0to1kb_part0/cluster_out
        mkdir(new_dir)
        return new_dir

    def _cp(task,
            new_task,
            copied_files,
            copy_consensus_isoforms=copy_consensus_isoforms,
            copy_flnc_pickle=copy_flnc_pickle,
            copy_nfl_pickle=copy_nfl_pickle):
        """Copy task.files to new_task.files."""
        if copy_consensus_isoforms is True and new_task.consensus_isoforms_file not in copied_files:
            shutil.copy(task.consensus_isoforms_file,
                        new_task.consensus_isoforms_file)
            copied_files[new_task.consensus_isoforms_file] = True
        if copy_flnc_pickle is True and new_task.flnc_pickle not in copied_files:
            mkdir(op.dirname(new_task.flnc_pickle))
            shutil.copy(task.flnc_pickle, new_task.flnc_pickle)
            copied_files[new_task.flnc_pickle] = True
        if copy_nfl_pickle is True and new_task.nfl_pickle not in copied_files:
            mkdir(op.dirname(new_task.nfl_pickle))
            shutil.copy(task.nfl_pickle, new_task.nfl_pickle)
            copied_files[new_task.nfl_pickle] = True


    print "making pickle from in_pickle %s to out_pickle %s, root_dir %s" % \
            (in_pickle, out_pickle, root_dir)

    p = ChunkTasksPickle.read(in_pickle)
    assert len(p) > 0
    if all([isinstance(task, ClusterChunkTask) for task in p]):
        outp = ChunkTasksPickle()
        copied_files = dict()
        for task in p:
            cluster_out_dir = make_cluster_out_dir(task.cluster_out_dir,
                                                   root_dir)
            print "new_cluster_out_dir is %s" % cluster_out_dir
            #flnc_file = make_flnc(task.flnc_file)
            new_task = ClusterChunkTask(task.cluster_bin_index, task.flnc_file,
                                        cluster_out_dir)
            _cp(task=task,
                new_task=new_task,
                copied_files=copied_files,
                copy_consensus_isoforms=copy_consensus_isoforms,
                copy_flnc_pickle=copy_flnc_pickle,
                copy_nfl_pickle=copy_nfl_pickle)
            outp.append(new_task)
        outp.write(out_pickle)
    elif all([isinstance(task, PartialChunkTask) for task in p]):
        outp = ChunkTasksPickle()
        copied_files = dict()
        for task in p:
            cluster_out_dir = make_cluster_out_dir(task.cluster_out_dir,
                                                   root_dir)
            print "new_cluster_out_dir is %s" % cluster_out_dir
            #flnc_file = make_flnc(task.flnc_file)
            new_task = PartialChunkTask(task.cluster_bin_index,
                                        task.flnc_file,
                                        cluster_out_dir,
                                        nfl_file=task.nfl_file,
                                        nfl_index=task.nfl_index,
                                        n_nfl_chunks=task.n_nfl_chunks)
            _cp(task=task,
                new_task=new_task,
                copied_files=copied_files,
                copy_consensus_isoforms=copy_consensus_isoforms,
                copy_flnc_pickle=copy_flnc_pickle,
                copy_nfl_pickle=copy_nfl_pickle)
            outp.append(new_task)
        outp.write(out_pickle)
    elif all([isinstance(task, PolishChunkTask) for task in p]):
        outp = ChunkTasksPickle()
        copied_files = dict()
        for task in p:
            cluster_out_dir = make_cluster_out_dir(task.cluster_out_dir,
                                                   root_dir)
            print "new_cluster_out_dir is %s" % cluster_out_dir
            #flnc_file = make_flnc(task.flnc_file)
            new_task = PolishChunkTask(task.cluster_bin_index,
                                       task.flnc_file,
                                       cluster_out_dir,
                                       polish_index=task.polish_index,
                                       n_polish_chunks=task.n_polish_chunks)
            mkdir(op.dirname(new_task.nfl_pickle))
            # always copy nfl_pickle for PolishChunkTask
            assert copy_nfl_pickle is True

            _cp(task=task,
                new_task=new_task,
                copied_files=copied_files,
                copy_consensus_isoforms=copy_consensus_isoforms,
                copy_flnc_pickle=copy_flnc_pickle,
                copy_nfl_pickle=copy_nfl_pickle)
            dst_dir = op.join(cluster_out_dir, "quivered")
            if copy_quivered is True and dst_dir not in copied_files:
                if op.exists(dst_dir):
                    shutil.rmtree(dst_dir)
                shutil.copytree(op.join(task.cluster_out_dir, "quivered"),
                                dst_dir)
                copied_files[dst_dir] = True
            outp.append(new_task)
        outp.write(out_pickle)
    else:
        assert False
Example #55
0
from pbcore.io import FastqReader
from pbtranscript.io import CollapseGffReader, AbundanceReader, GroupReader
from pbtranscript.Utils import rmpath, mkdir
from pbtranscript.filtering.FilteringUtils import good_isoform_ids_by_count, \
    good_isoform_ids_by_removing_subsets, filter_by_count, filter_out_subsets

from test_setpath import DATA_DIR, OUT_DIR, SIV_DATA_DIR, SIV_STD_DIR

GROUP_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.group.txt")
ABUNDANCE_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.abundance.txt")
GFF_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.gff")
REP_FN = op.join(SIV_DATA_DIR, "test_filtering", "in.rep.fastq")

_OUT_DIR_ = op.join(OUT_DIR, "test_filtering")
rmpath(_OUT_DIR_)
mkdir(_OUT_DIR_)


class TEST_FilteringUtils(unittest.TestCase):
    """Test functions of pbtranscript.filtering.FilteringUtils."""
    def setUp(self):
        """Define input and output file."""
        self.expected_good = ['PB.2.5', 'PB.5.1', 'PB.7.1', 'PB.10.2', 'PB.10.42', 'PB.12.1']
        self.expected_diff = ['PB.10.42', 'PB.10.36', 'PB.10.35']

    def test_good_isoform_ids_by_count(self):
        """Test good_isoform_ids_by_count"""
        good = good_isoform_ids_by_count(in_group_filename=GROUP_FN,
                                         in_abundance_filename=ABUNDANCE_FN,
                                         min_count=20)
        self.assertEqual(good, self.expected_good)
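
good_isoform_ids_by_count keeps the isoform ids whose supporting read count reaches min_count, which is what the assertion above checks. The toy sketch below captures only that idea; the dict is a hypothetical stand-in for the parsed group and abundance files, not their real format.

def good_ids_by_count(counts, min_count):
    """Return isoform ids whose count is at least min_count (toy version)."""
    return [iso for iso, n in sorted(counts.items()) if n >= min_count]

counts = {"PB.2.5": 31, "PB.5.1": 20, "PB.3.1": 5}
good = good_ids_by_count(counts, min_count=20)
# good == ['PB.2.5', 'PB.5.1']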
Example #56
0
 def cluster_dir(self, cid):
     """"overwrite IceQuiver.cluster_dir"""
     dir_name = IceQuiver.cluster_dir(self, cid)
     mkdir(dir_name)
     return dir_name