Example #1
    def _startPhmmers(self, chunked_reads_fns, chunked_dom_fns,
                      out_dom_fn, primer_fn, pbmatrix_fn):
        """Run phmmers on chunked reads files in 'chunked_reads_fns' and
        generate chunked dom files as listed in 'chunked_dom_fns', finally
        concatenate dom files to 'out_dom_fn'."""
        logging.info("Start to launch phmmer on chunked reads.")
        jobs = []
        for reads_fn, domFN in zip(chunked_reads_fns, chunked_dom_fns):
            p = multiprocessing.Process(
                target=self._phmmer,
                args=(reads_fn, domFN, primer_fn, pbmatrix_fn))
            jobs.append((p, domFN))
            p.start()

        for p, domFN in jobs:
            p.join()
            cmd = "cat {0} >> {1}".format(real_upath(domFN),
                                          real_upath(out_dom_fn))
            _output, errCode, errMsg = backticks(cmd)
            if errCode != 0:
                raise ClassifierException(
                    "Error concatenating dom files: {e}".
                    format(e=str(errMsg)))

        self._cleanup(chunked_reads_fns)
        self._cleanup(chunked_dom_fns)
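
Every example on this page leans on two helpers that are imported from pbtranscript's utility modules but never shown: real_upath (an absolute path escaped for safe embedding in a shell command) and backticks (run a shell command and report its output, exit code, and error message). A minimal sketch consistent with how these call sites use them, assuming that three-tuple contract:

import os
import subprocess

def real_upath(path):
    # Absolute, user-expanded path with spaces escaped so it can be
    # embedded in a shell command line (sketch of the pbtranscript helper).
    return os.path.abspath(os.path.expanduser(path)).replace(' ', '\\ ')

def backticks(cmd):
    # Run cmd through the shell; return (stdout lines, exit code, stderr).
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE, universal_newlines=True)
    out, err = p.communicate()
    return out.splitlines(), p.returncode, err.strip()
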
Example #2
 def _phmmer(self, reads_fn, domFN, primer_fn, pbmatrixFN):
     """Invoke phmmer once."""
     cmd = "phmmer --cpu 1 --domtblout {d} --noali --domE 1 ".\
           format(d=real_upath(domFN)) + \
           "--mxfile {m} ".format(m=real_upath(pbmaxtrixFN)) + \
           "--popen 0.07 --pextend 0.07 {r} {p} > /dev/null".\
           format(r=real_upath(reads_fn), p=real_upath(primer_fn))
     logging.debug("Calling phmmer: {cmd}".format(cmd=cmd))
     _output, errCode, errMsg = backticks(cmd)
     if errCode != 0:
         raise ClassifierException(
             "Error calling phmmer: {e}.".format(e=str(errMsg)))
Example #3
def build_sa(input_fasta, out_sa):
    """Generate suffix array of input_fasta"""
    if op.exists(input_fasta):
        cmd = "sawriter {o} {i} -blt 8 -welter ".\
            format(o=real_upath(out_sa), i=real_upath(input_fasta))
        _out, _code, _msg = backticks(cmd)
        if _code == 0:
            return True
        else:
            # Failed to generate the suffix array; warn and continue.
            logging.warn("Unable to create suffix array for {f}.".format(f=input_fasta))
            return False
    else:
        raise IOError("Unable to find fasta file {f}.".format(f=input_fasta))
Example #4
def sanity_check_sge(sge_opts, scriptDir, testDirName="gcon_test_dir"):
    """Sanity check if sge can work."""
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    if not op.exists(scriptDir):
        os.makedirs(scriptDir)
    if not op.exists(testDir):
        os.makedirs(testDir)

    testSh = op.join(scriptDir, 'test.sh')
    consensusFa = op.join(testDir, "g_consensus.fasta")
    testInFa = op.join(testDir, "gcon_in.fa")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert(op.exists(testInFa))

    with open(testSh, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write("{gcon}".format(gcon=gcon_py) +
                " {inFa} ".format(inFa=real_upath(testInFa)) +
                " {testDir}/g_consensus".format(testDir=real_upath(testDir)) +
                " c1\n")

    assert(op.exists(testSh))
    cmd = "qsub"
    if sge_opts.sge_queue is not None:
        cmd += " -q " + sge_opts.sge_queue
    cmd += " -sync y -pe {env} 1 -cwd -S /bin/bash -V -e /dev/null -o /dev/null {t}".\
          format(t=real_upath(testSh), env=sge_opts.sge_env_name)
    logging.info("Submitting cmd: " + cmd)
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        logging.error("qsub failed: {m}".format(m=str(_msg)))
        return False

#    answer = FastaReader(GCON_OUT_FA).__iter__().next()
#    tester = FastaReader(consensusFa).__iter__().next()
#
#    if answer.name != tester.name or \
#       answer.sequence != tester.sequence:
    if not filecmp.cmp(consensusFa, GCON_OUT_FA):
        errMsg = "Trouble running qsub or output is not as " + \
                 "expected ({0} and {1} must agree). Abort!".format(
                     consensusFa, GCON_OUT_FA)
        logging.error(errMsg)
        return False
    else:
        shutil.rmtree(testDir)
        logging.info("sge and gcon check passed.")
        return True
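
A hypothetical guard at pipeline startup, failing fast before any real jobs are queued (sge_opts and script_dir are assumed to come from the caller):

if sge_opts.use_sge and not sanity_check_sge(sge_opts, script_dir):
    raise RuntimeError("SGE sanity check failed; "
                       "verify qsub and the gcon script before rerunning.")
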
Example #5
 def numReads(self):
     """Return the number of reads in reads_fn."""
     cmd = "grep -c '>' {r}".format(r=real_upath(self.reads_fn))
     output, errCode, errMsg = backticks(cmd)
     if errCode != 0:
         raise ClassifierException("Error reading file {r}:{e}".format(
             r=self.reads_fn, e=str(errMsg)))
     return int(output[0])
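
The grep -c '>' call counts FASTA header lines through the shell; a dependency-free sketch of the same count in plain Python:

def num_reads(reads_fn):
    # Count FASTA records by counting header lines.
    with open(reads_fn) as f:
        return sum(1 for line in f if line.startswith('>'))
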
Example #6
 def _align_withBLASR(self, queryFa, targetFa, outFN, ice_opts, sge_opts):
     """Align input reads against itself using BLASR."""
     if os.path.exists(outFN):
         logging.info(
             "{0} already exists. No need to run BLASR.".format(outFN))
     else:
         cmd = "blasr {q} ".format(q=real_upath(queryFa)) + \
               "{t} ".format(t=real_upath(targetFa)) + \
               "-m 5 -maxLCPLength 15 " + \
               "-nproc {cpu} ".format(cpu=sge_opts.blasr_nproc) + \
               "-maxScore {score} ".format(score=ice_opts.maxScore) + \
               "-bestn {n} -nCandidates {n} ".format(n=ice_opts.bestn) + \
               "-out {o}".format(o=real_upath(outFN))
         logging.info("Calling {cmd}".format(cmd=cmd))
         _output, code, msg = backticks(cmd)
         if code != 0:
             errMsg = "{cmd} exited with {code}: {msg}".\
                     format(cmd=cmd, code=code, msg=msg)
             logging.error(errMsg)
             raise RuntimeError(errMsg)
Example #7
def blasr_sam_for_quiver(input_fasta, ref_fasta,
                         out_sam_filename,
                         run_cmd=True, blasr_nproc=12):
    """
    input_fasta --- should be in.raw.fa
    ref_fasta --- reference fasta (ex: g_consensus.fa) to align to
    out_sam_filename --- SAM output aligning input_fasta to ref_fasta

    Run blasr with -clipping soft to get SAM output.
    """
    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} ".format(n=blasr_nproc) + \
          "-bestn 5 -nCandidates 10 -sam -clipping soft " + \
          "-out {o}".format(o=real_upath(out_sam_filename))
    logging.debug("CMD: " + cmd)
    if run_cmd:
        _out, _code, _msg = backticks(cmd)
        if _code != 0:
            raise RuntimeError("CMD failed: {cmd}\n{e}".
                               format(cmd=cmd, e=_msg))
    return cmd
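
Because run_cmd=False returns the command without executing it, the function doubles as a command generator. A hypothetical driver that batches alignments into one shell script (input_fastas and ref_fa are assumed inputs):

cmds = [blasr_sam_for_quiver(fa, ref_fa, fa + ".sam", run_cmd=False)
        for fa in input_fastas]
with open("align_all.sh", "w") as f:
    f.write("#!/bin/bash\n")
    f.write("\n".join(cmds) + "\n")
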
Example #8
    def submit_todo_quiver_jobs(self, todo, submitted, sge_opts):
        """
        todo --- a list of sh scripts to run
        submitted --- a list of sh scripts which have been submitted
        sge_opts --- SGE options, including
                     use_sge, whether or not to use sge
                     max_sge_jobs, maximum number sge jobs to submit
                     quiver_nproc, number of nproc per job
                     unique_id, unique id to name qsub jobs
        """
        time0 = datetime.now()
        if sge_opts.use_sge is not True or \
           sge_opts.max_sge_jobs == 0:  # don't use SGE
            for job in todo:
                elog = op.join(self.quivered_log_dir,
                               op.basename(job) + ".elog")
                olog = op.join(self.quivered_log_dir,
                               op.basename(job) + ".olog")
                cmd = "bash " + real_upath(job) + " 1>{olog} 2>{elog}".\
                      format(olog=real_upath(olog), elog=real_upath(elog))
                self.run_cmd_and_log(cmd,
                                     olog=olog,
                                     elog=elog,
                                     description="Failed to run Quiver")
                submitted.append(("local", job))
            todo = []
        else:
            while len(todo) > 0:
                n = min(sge_opts.max_sge_jobs, len(todo))
                for job in todo[:n]:
                    # ex: Your job 8613116 ("c20to70.sh") has been submitted
                    elog = op.join(self.quivered_log_dir,
                                   op.basename(job) + ".elog")
                    olog = op.join(self.quivered_log_dir,
                                   op.basename(job) + ".olog")
                    jid = "ice_quiver_{unique_id}_{name}".format(
                        unique_id=self.sge_opts.unique_id,
                        name=op.basename(job))

                    qsub_cmd = "qsub"
                    if self.sge_opts.sge_queue is not None:
                        qsub_cmd += " -q " + self.sge_opts.sge_queue
                    qsub_cmd += " -pe {env} {n} ".format(n=sge_opts.quiver_nproc, env=sge_opts.sge_env_name) + \
                               "-cwd -S /bin/bash -V " + \
                               "-e {elog} ".format(elog=real_upath(elog)) +\
                               "-o {olog} ".format(olog=real_upath(olog)) +\
                               "-N {jid} ".format(jid=jid) + \
                               "{job}".format(job=real_upath(job))
                    job_id = self.qsub_cmd_and_log(qsub_cmd)

                    submitted.append((job_id, job))
                    todo.remove(job)
                # end of for job in todo[:n]
            # end of while len(todo) > 0
        # end of else (use sge)
        self.add_log("Total time submitting todo quiver jobs: {0}".format(
            datetime.now() - time0))
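
The qsub string assembled here reappears almost verbatim in Examples #4, #13, and #15. A sketch of a helper that factors the pattern out; the name and signature are hypothetical:

def make_qsub_cmd(script, jid, elog, olog, sge_opts, nproc):
    # Assemble the qsub invocation used throughout these examples.
    cmd = "qsub"
    if sge_opts.sge_queue is not None:
        cmd += " -q " + sge_opts.sge_queue
    cmd += (" -pe {env} {n} -cwd -S /bin/bash -V"
            " -e {elog} -o {olog} -N {jid} {script}").format(
                env=sge_opts.sge_env_name, n=nproc,
                elog=real_upath(elog), olog=real_upath(olog),
                jid=jid, script=real_upath(script))
    return cmd
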
Example #9
    def quiver_cmds_for_bin(self, cids, quiver_nproc=2):
        """Return a list of quiver related cmds, to convert sam & ref to cmp.h5
        and call quiver, including samtoh5, loadPulses, cmph5tools.py,
        samtools, loadChemistry, quiver...
        """
        first, last = cids[0], cids[-1]
        self.add_log("Creating quiver cmds for c{first} to c{last}".
                     format(first=first, last=last))

        bin_sam_file = self.sam_of_quivered_bin(first, last)
        bin_ref_fa = self.ref_fa_of_quivered_bin(first, last)
        bin_cmph5 = self.cmph5_of_quivered_bin(first, last)
        bin_fq = self.fq_of_quivered_bin(first, last)

        cmds = []
        cmds.append("samtoh5 {sam} {ref} {cmph5} -smrtTitle".format(
            sam=real_upath(bin_sam_file),
            ref=real_upath(bin_ref_fa),
            cmph5=real_upath(bin_cmph5)))
        # (Liz) don't gzip the sa
        #cmds.append("gzip {sam}".format(sam=real_upath(bin_sam_file)))
        metrics = ["QualityValue", "InsertionQV", "MergeQV", "DeletionQV",
                   "DeletionTag", "SubstitutionTag", "SubstitutionQV"]
        cmds.append("loadPulses {bas_fofn} ".
                    format(bas_fofn=real_upath(self.bas_fofn)) +
                    "{cmph5} ".format(cmph5=real_upath(bin_cmph5)) +
                    "-byread -metrics " + ",".join(metrics))
        cmds.append("cmph5tools.py sort {cmph5}".
                    format(cmph5=real_upath(bin_cmph5)))
        cmds.append("samtools faidx {ref}".format(ref=real_upath(bin_ref_fa)))
        cmds.append("loadChemistry.py {bas_fofn} {cmph5}".
                    format(bas_fofn=real_upath(self.bas_fofn),
                           cmph5=real_upath(bin_cmph5)))
        cmds.append("quiver {cmph5} ".format(cmph5=real_upath(bin_cmph5)) +
                    "-v -j{n} ".format(n=quiver_nproc) +
                    "-r {ref} ".format(ref=real_upath(bin_ref_fa)) +
                    "-o {fq}".format(fq=real_upath(bin_fq)))
        return cmds
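
The returned list is meant to be written into a per-bin shell script and handed to submit_todo_quiver_jobs (Example #8). A hypothetical sketch of that glue, inside the same class (quivered_dir is an assumed attribute):

cmds = self.quiver_cmds_for_bin(cids, quiver_nproc=8)
script = op.join(self.quivered_dir,
                 "c{0}to{1}.sh".format(cids[0], cids[-1]))
with open(script, "w") as f:
    f.write("#!/bin/bash\n")
    f.write("\n".join(cmds) + "\n")
todo.append(script)  # later consumed by submit_todo_quiver_jobs
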
Example #10
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False, cpus=1):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    #out_fns = []
    mkdir(fasta_out_dir)

    # multiprocessing worker stuff
    manager = Manager()
    in_queue = manager.Queue(len(in_fns))
    in_queue_count = 0
    outfile_track = {} # expected out file --> (cmd, tmp)
    pool = []
    out_fns = []

    for in_fn in in_fns:
        #print >> sys.stderr, "DEBUG: converting h5 file:", in_fn
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists. skipping.".format(out_file))
            out_fns.append(out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            print >> sys.stderr, "DEBUG: putting in queue:", cmd, tmp_out_file, out_file
            in_queue.put((cmd, tmp_out_file, out_file))
            in_queue_count += 1
            outfile_track[out_file] = (cmd, tmp_out_file)
            print >> sys.stderr, "DEBUG: put in queue:", cmd, tmp_out_file, out_file

    cpus = min(cpus, in_queue_count)  # cap CPUs if there are fewer files to convert
    for i in xrange(cpus):
        p = Process(target=convert_fofn_to_fasta_worker, args=(in_queue,))
        pool.append(p)

    #error_flag = False
    # starting & joining pool worakers
    for p in pool:
        p.start()
        #print >> sys.stderr, "Starting worker", p.name
    for p in pool:
        #print >> sys.stderr, "Waiting join", p.name
        p.join(timeout=1200)
        if p.is_alive(): p.terminate()

    # Check that all expected output files exist;
    # if any is missing, force the conversion to run locally.
    for out_file, (cmd, tmp_out_file) in outfile_track.iteritems():
        if not op.exists(out_file):
            in_queue.put((cmd, tmp_out_file, out_file))
            convert_fofn_to_fasta_worker(in_queue)
        out_fns.append(out_file)

    #if error_flag:
    #    raise Exception, "Unable to successfuly run convert_fofn_to_fasta, ABORT!"

    write_files_to_fofn(out_fns, out_filename)
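
convert_fofn_to_fasta_worker itself is not shown on this page. A sketch consistent with the (cmd, tmp_out_file, out_file) tuples queued above and with the commented-out serial code in Example #14; trim_subread_flanks is assumed to post-process the pls2fasta output:

def convert_fofn_to_fasta_worker(in_queue):
    # Drain the queue; each item is (cmd, tmp_out_file, out_file).
    while not in_queue.empty():
        cmd, tmp_out_file, out_file = in_queue.get()
        if op.exists(out_file):
            continue  # already converted; re-queued jobs become no-ops
        _out, _code, _msg = backticks(cmd)
        if _code != 0:
            logging.error("CMD failed: {0}\n{1}".format(cmd, _msg))
            continue
        trim_subread_flanks(tmp_out_file, out_file)
        if op.exists(tmp_out_file):
            os.remove(tmp_out_file)
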
Example #11
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)
    
    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn,
                                fasta_filename=input_fasta)
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                i=input_fasta, f=ccs_fofn, s=time.time() - start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(
                i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            logging.info("Loading QVs from {fq} took {s} secs".format(
                fq=input_fastq, s=time.time() - start_t))

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
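
Reading the pickle back is symmetric. A minimal sketch, assuming the dump above came from Python 2's cPickle:

from cPickle import load  # on Python 3: from pickle import load

with open(out_pickle) as f:
    d = load(f)
partial_uc = d['partial_uc']  # {cluster_id: [read_ids]}
nohit = d['nohit']            # set of read ids with no isoform hit
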
Example #12
def concat_sam(samfiles, outsam_filename):
    """
    Header looks like:
    @HD     VN:1.3.1
    @SQ     SN:c31  LN:3104 M5:ef7d3f84dea9d9face43e6fd5b6336c4
    @RG     ID:2caa54eef6   PU:in.raw_with_partial.fa       SM:NO_CHIP_ID
    @PG     ID:BLASR        VN:1.3.1.126469 CL:blasr in.raw_with_partial.fa g_consensus.fa -nproc 12 -bestn 5 -nCandidates 10 -sam -out out.sam

    NOTE: check for M5 conflicts; rewrite a digest if it conflicts.
    """
    f_sq = open(outsam_filename + '.sq', 'w')
    f_bd = open(outsam_filename + '.bd', 'w')

    rg_line = None
    pg_line = None

    md5_seen = set()

    if len(samfiles) == 0:
        raise ValueError("No sam input files to concatenate.")

    h = open(samfiles[0])
    line = h.readline()
    assert line.startswith('@HD')
    f_sq.write(line)
    line = h.readline()
    assert line.startswith('@SQ')
    line = h.readline()
    assert line.startswith('@RG')
    rg_line = line  # write at the end
    line = h.readline()
    assert line.startswith('@PG')
    pg_line = line  # write at the end
    h.close()

    for f in samfiles:
        with open(f) as h:
            assert h.readline().startswith('@HD')
            line = h.readline()
            assert line.startswith('@SQ')
            # ------- check for MD5 conflicts ----------- #
            m5 = line.strip().split()[-1]
            assert m5.startswith("M5:")
            digest = m5[3:]
            if digest not in md5_seen:
                f_sq.write(line)
                md5_seen.add(digest)
            else:
                while True:
                    # Shuffle the digest characters until the result is
                    # unique among the digests seen so far.
                    s = list(digest)
                    random.shuffle(s)
                    s = "".join(s)
                    if s not in md5_seen:
                        break
                line = line[:line.find('M5:')] + 'M5:' + s + '\n'
                logging.debug("MD5 conflict: change to {0}".format(s))
                md5_seen.add(s)
                f_sq.write(line)
            # ----- end MD5 checking and writing --------- #
            assert h.readline().startswith('@RG')
            assert h.readline().startswith('@PG')
            for line in h:
                f_bd.write(line)

    f_bd.close()
    f_sq.write(rg_line)
    f_sq.write(pg_line)
    f_sq.close()

    cmd = "cat {0}.sq {0}.bd > {0}".format(real_upath(outsam_filename))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        raise IOError("Failed to concat sam files! Abort." + _msg)

    os.remove(f_sq.name)
    os.remove(f_bd.name)
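
The final cat runs through the shell only to join two files; a pure-Python sketch of the same step:

import shutil

with open(outsam_filename, 'w') as out_f:
    for part in (outsam_filename + '.sq', outsam_filename + '.bd'):
        with open(part) as in_f:
            shutil.copyfileobj(in_f, out_f)
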
Example #13
    def createPickles(self):
        """For each file in fastq_filenames, call 'ice_partial.py one' to
        build clusters and to save results to a pickle file. When all pickles
        are done, union all pickles.
        """
        self.add_log("Mapping non-full-length reads to consensus isoforms.")
        self.add_log("Creating pickles...", level=logging.INFO)

        # using --blasr_nproc=4 because DALIGNER uses only 4 cores
        for idx, fq in enumerate(self.fastq_filenames):
            # For each split non-full-length reads fastq file,
            # build a partial_uc.pickle.
            cmd = "ice_partial.py one {i} ".format(i=real_upath(fq)) + \
                  "{r} ".format(r=real_upath(self.ref_fasta)) + \
                  "{o} ".format(o=real_upath(self.pickle_filenames[idx])) + \
                  "--blasr_nproc={n} ".format(n=4) + \
                  "--done={d} ".format(d=real_upath(self.done_filenames[idx]))
            if self.ccs_fofn is not None:
                cmd += "--ccs_fofn={f} ".format(f=real_upath(self.ccs_fofn))
            if self.sa_file is not None:
                cmd += "--sa={sa} ".format(sa=real_upath(self.sa_file))

            self.add_log("Writing command to script {fsh}".
                         format(fsh=self.script_filenames[idx]))
            self.add_log("CMD: {0}".format(cmd))
            with open(self.script_filenames[idx], 'w') as fsh:
                fsh.write(cmd + "\n")

            # determine elog & olog
            partial_log_fn = op.join(self.log_dir,
                                     'IcePartial.{idx}'.format(idx=idx))
            elog = partial_log_fn + ".elog"
            olog = partial_log_fn + ".olog"
            jid = "ice_partial_{unique_id}_{name}".format(
                unique_id=self.sge_opts.unique_id,
                name=op.basename(fq))

            qsub_cmd = "qsub"
            if self.sge_opts.sge_queue is not None:
                qsub_cmd += " -q " + self.sge_opts.sge_queue
            qsub_cmd += " -pe {env} {n} ".format(env=self.sge_opts.sge_env_name, n=4) + \
                       "-cwd -S /bin/bash -V " + \
                       "-e {elog} ".format(elog=real_upath(elog)) + \
                       "-o {olog} ".format(olog=real_upath(olog)) + \
                       "-N {jid} ".format(jid=jid) + \
                       "{sh}".format(sh=real_upath(self.script_filenames[idx]))

            self.add_log("Creating a pickle for {f}".format(f=fq))

            if self.sge_opts.use_sge is True:
                self.qsub_cmd_and_log(qsub_cmd)
            else:
                cmd += " 1>{olog} 2>{elog}".format(olog=real_upath(olog),
                                                   elog=real_upath(elog))
                self.run_cmd_and_log(cmd=cmd, olog=olog, elog=elog)
Example #14
def convert_fofn_to_fasta(fofn_filename,
                          out_filename,
                          fasta_out_dir,
                          force_overwrite=False,
                          cpus=1):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    #out_fns = []
    mkdir(fasta_out_dir)

    # multiprocessing worker stuff
    manager = Manager()
    out_fns = manager.list()
    in_queue = manager.Queue(99999)
    pool = []
    for i in xrange(cpus):
        p = Process(target=convert_fofn_to_fasta_worker,
                    args=(in_queue, out_fns))
        pool.append(p)

    for in_fn in in_fns:
        #print >> sys.stderr, "DEBUG: converting h5 file:", in_fn
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug(
                "File {0} already exists. Skipping.".format(out_file))
            out_fns.append(out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            print >> sys.stderr, "DEBUG: putting in queue:", cmd, tmp_out_file, out_file
            in_queue.put((cmd, tmp_out_file, out_file))
            print >> sys.stderr, "DEBUG: put in queue:", cmd, tmp_out_file, out_file


#            logging.debug("CMD: {cmd}".format(cmd=cmd))
#            _out, _code, _msg = backticks(cmd)
#            if _code != 0:
#                raise RuntimeError("CMD failed: {cmd}\n".format(cmd=cmd) + _msg)
#            trim_subread_flanks(tmp_out_file, out_file)
#        out_fns.append(out_file)
#        if op.exists(tmp_out_file):
#            os.remove(tmp_out_file)

# starting & joining pool worakers
    for p in pool:
        p.start()
        #print >> sys.stderr, "Starting worker", p.name
    for p in pool:
        #print >> sys.stderr, "Waiting join", p.name
        p.join()

    write_files_to_fofn(out_fns, out_filename)
Example #15
    def createPickles(self):
        """For each file in fasta_filenames, call 'ice_partial.py one' to
        build clusters and to save results to a pickle file. When all pickles
        are done, union all pickles.
        """
        self.add_log("Mapping non-full-length reads to consensus isoforms.")
        self.add_log("Creating pickles...", level=logging.INFO)

        # using --blasr_nproc=4 because DALIGNER uses only 4 cores
        for idx, fa in enumerate(self.fasta_filenames):
            # For each split non-full-length reads fasta file,
            # build a partial_uc.pickle.
            cmd = "ice_partial.py one {i} ".format(i=real_upath(fa)) + \
                  "{r} ".format(r=real_upath(self.ref_fasta)) + \
                  "{o} ".format(o=real_upath(self.pickle_filenames[idx])) + \
                  "--blasr_nproc={n} ".format(n=4) + \
                  "--done={d} ".format(d=real_upath(self.done_filenames[idx]))
            if self.ccs_fofn is not None:
                cmd += "--ccs_fofn={f} ".format(f=real_upath(self.ccs_fofn))
            if self.sa_file is not None:
                cmd += "--sa={sa} ".format(sa=real_upath(self.sa_file))

            self.add_log("Writing command to script {fsh}".
                         format(fsh=self.script_filenames[idx]))
            self.add_log("CMD: {0}".format(cmd))
            with open(self.script_filenames[idx], 'w') as fsh:
                fsh.write(cmd + "\n")

            # determine elog & olog
            partial_log_fn = op.join(self.log_dir,
                                     'IcePartial.{idx}'.format(idx=idx))
            elog = partial_log_fn + ".elog"
            olog = partial_log_fn + ".olog"
            jid = "ice_partial_{unique_id}_{name}".format(
                unique_id=self.sge_opts.unique_id,
                name=op.basename(fa))

            qsub_cmd = "qsub"
            if self.sge_opts.sge_queue is not None:
                qsub_cmd += " -q " + self.sge_opts.sge_queue
            qsub_cmd += " -pe {env} {n} ".format(env=self.sge_opts.sge_env_name, n=4) + \
                       "-cwd -S /bin/bash -V " + \
                       "-e {elog} ".format(elog=real_upath(elog)) + \
                       "-o {olog} ".format(olog=real_upath(olog)) + \
                       "-N {jid} ".format(jid=jid) + \
                       "{sh}".format(sh=real_upath(self.script_filenames[idx]))

            self.add_log("Creating a pickle for {f}".format(f=fa))

            if self.sge_opts.use_sge is True:
                self.qsub_cmd_and_log(qsub_cmd)
            else:
                cmd += " 1>{olog} 2>{elog}".format(olog=real_upath(olog),
                                                   elog=real_upath(elog))
                self.run_cmd_and_log(cmd=cmd, olog=olog, elog=elog)
Example #16
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)
    
    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn,
                                fasta_filename=input_fasta)
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                i=input_fasta, f=ccs_fofn, s=time.time() - start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(
                i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            logging.info("Loading QVs from {fq} took {s} secs".format(
                fq=input_fastq, s=time.time() - start_t))

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=20,
                                 same_strand_only=False,
                                 max_missed_start=200,
                                 max_missed_end=50)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
Example #17
    def quiver_cmds_for_bin(self, cids, quiver_nproc=2):
        """Return a list of quiver related cmds, to convert sam & ref to cmp.h5
        and call quiver, including samtoh5, loadPulses, cmph5tools.py,
        samtools, loadChemistry, quiver...
        """
        first, last = cids[0], cids[-1]
        self.add_log("Creating quiver cmds for c{first} to c{last}".format(
            first=first, last=last))

        bin_sam_file = self.sam_of_quivered_bin(first, last)
        bin_ref_fa = self.ref_fa_of_quivered_bin(first, last)
        bin_cmph5 = self.cmph5_of_quivered_bin(first, last)
        bin_fq = self.fq_of_quivered_bin(first, last)

        cmds = []
        cmds.append("samtoh5 {sam} {ref} {cmph5} -smrtTitle".format(
            sam=real_upath(bin_sam_file),
            ref=real_upath(bin_ref_fa),
            cmph5=real_upath(bin_cmph5)))
        cmds.append("gzip {sam}".format(sam=real_upath(bin_sam_file)))
        metrics = [
            "QualityValue", "InsertionQV", "MergeQV", "DeletionQV",
            "DeletionTag", "SubstitutionTag", "SubstitutionQV"
        ]
        cmds.append("loadPulses {bas_fofn} ".format(
            bas_fofn=real_upath(self.bas_fofn)) +
                    "{cmph5} ".format(cmph5=real_upath(bin_cmph5)) +
                    "-byread -metrics " + ",".join(metrics))
        cmds.append(
            "cmph5tools.py sort {cmph5}".format(cmph5=real_upath(bin_cmph5)))
        cmds.append("samtools faidx {ref}".format(ref=real_upath(bin_ref_fa)))
        cmds.append("loadChemistry.py {bas_fofn} {cmph5}".format(
            bas_fofn=real_upath(self.bas_fofn), cmph5=real_upath(bin_cmph5)))
        cmds.append("quiver {cmph5} ".format(cmph5=real_upath(bin_cmph5)) +
                    "-v -j{n} ".format(n=quiver_nproc) +
                    "-r {ref} ".format(ref=real_upath(bin_ref_fa)) +
                    "-o {fq}".format(fq=real_upath(bin_fq)))
        return cmds