Example No. 1
    def __init__(self, input_fasta, reads_per_split, out_dir, out_prefix):
        self.input_fasta = input_fasta
        self.out_dir = out_dir
        self.reads_per_split = reads_per_split  # number of reads per split
        self.out_prefix = out_prefix
        self.out_fns = None  # output filenames, filled in once the split runs
        mkdir(self.out_dir)  # ensure the output directory exists
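A minimal usage sketch; the class name FastaSplitter is an assumption (the
listing only shows the constructor), and mkdir is assumed to create the
directory if it does not already exist:

    # Hypothetical instantiation of the splitter shown above.
    splitter = FastaSplitter(input_fasta="reads.fasta",
                             reads_per_split=20000,
                             out_dir="splits",
                             out_prefix="input.split")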
Example No. 2
    def _validate_inputs(self, root_dir, nfl_fa, N):
        """
        Check inputs, return
        (num_reads,
         number_reads_per_chunk,
         nfl_dir,
         [i-th_chunk_nfl_fa for i in [0...N-1]])
        """
        icef = IceFiles(prog_name="ice_partial_split",
                        root_dir=root_dir,
                        no_log_f=False)

        nfl_dir = icef.nfl_dir

        # root_dir/output/map_noFL/input.split_{0:02d}.fa
        split_nfl_fas = [icef.nfl_fa_i(i) for i in range(N)]

        mkdir(icef.nfl_dir)

        # Check if inputs exist.
        errMsg = ""
        if not nfs_exists(nfl_fa):
            errMsg = "The input non-full-length reads fasta file " + "{f} does not exists. ".format(f=nfl_fa)
        if len(errMsg) != 0:
            raise ValueError(errMsg)

        num_reads = num_reads_in_fasta(nfl_fa)
        # Divide as floats so ceil() gives a true ceiling; with two ints,
        # num_reads / N floors first and ceil() becomes a no-op.
        reads_per_split = int(max(1, ceil(num_reads / float(N))))

        return (num_reads, reads_per_split, nfl_dir, split_nfl_fas)
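The chunk size is just a ceiling division; a quick self-contained check of
the corrected formula:

    from math import ceil

    num_reads, N = 10001, 4
    reads_per_split = int(max(1, ceil(num_reads / float(N))))
    assert reads_per_split == 2501  # 4 chunks of <= 2501 reads cover 10001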
Example No. 3
    def run(self):
        """Run quiver for ICE."""
        # Create directories: root_dir/quivered and root_dir/log_dir/quivered
        mkdir(self.quivered_dir)
        mkdir(self.quivered_log_dir)

        files = get_files_from_fofn(self.fasta_fofn)
        msg = "Indexing {0} fasta files, please wait.".format(len(files))
        self.add_log(msg)

        d = MetaSubreadFastaReader(files)
        self.add_log("Fasta files indexing done.")

        self.add_log("Loading uc from {f}.".format(f=self.final_pickle_fn))
        a = load(open(self.final_pickle_fn))
        uc = a['uc']
        refs = a['refs']

        self.add_log("Loading partial uc from {f}.".
                     format(f=self.nfl_all_pickle_fn))
        partial_uc = load(open(self.nfl_all_pickle_fn))['partial_uc']
        partial_uc2 = defaultdict(lambda: [])
        partial_uc2.update(partial_uc)

        # Write report to quivered/cluster_report.FL_nonFL.csv
        self.add_log("Writing a csv report of cluster -> FL/NonFL reads to {f}".
                     format(f=self.report_fn), level=logging.INFO)
        self.write_report(uc=uc, partial_uc=partial_uc2,
                          report_fn=self.report_fn)

        good = list(uc)  # keep every cluster id; a stricter filter would be
        # [x for x in uc if len(uc[x]) > 1 or len(partial_uc2[x]) >= 10]
        keys = sorted(good)  # sort good keys (cluster ids)

        start = 0
        end = len(keys)

        submitted = []  # submitted jobs
        todo = []       # to-do jobs

        self.submit_quiver_jobs(d=d, uc=uc, partial_uc=partial_uc2,
            refs=refs, keys=keys, start=start, end=end,
            submitted=submitted, todo=todo,
            use_sge=self.sge_opts.use_sge,
            max_sge_jobs=self.sge_opts.max_sge_jobs,
            quiver_nproc=self.sge_opts.quiver_nproc)

        with open(self.submitted_quiver_jobs_log, 'w') as f:
            f.write("\n".join(str(x[0]) + '\t' + str(x[1]) for x in submitted))

        self.close_log()
        return 0
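The submitted-jobs log written at the end is a plain two-column TSV. A
hedged reader sketch (the file name and the column meanings are inferred
from the write() call above, not from documented behavior):

    # Each line is "<field0>\t<field1>", per str(x[0]) + '\t' + str(x[1]).
    with open("submitted_quiver_jobs.txt") as f:  # path is an assumption
        jobs = [line.rstrip("\n").split("\t", 1) for line in f if line.strip()]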
Example No. 4
    def validate_inputs(self):
        """Validate input fofns, and root_dir, log_dir, tmp_dir,
        create quivered_dir and quivered_log_dir"""
        self.add_log("Validating inputs.")

        # Create directories: root_dir/quivered and root_dir/log_dir/quivered
        try:
            mkdir(self.quivered_dir)
            mkdir(self.quivered_log_dir)
        except OSError:
            # Multiple ice_quiver_i jobs may run at the same time and try to
            # mkdir, race condition may happen, so ignore OSError here.
            pass

        errMsg = ""

        if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
            errMsg = "Log dir {l} is not an existing directory.".\
                format(l=self.log_dir)
        elif self.bas_fofn is None:
            errMsg = "Please specify bas_fofn (e.g. input.fofn)."
        elif not nfs_exists(self.bas_fofn):
            errMsg = "bas_fofn {f} ".format(f=self.bas_fofn) + \
                     "which contains bas/bax.h5 files does not exist."
        elif self.fasta_fofn is None:
            errMsg = "Please make sure ice_make_fasta_fofn has " + \
                     "been called, and specify fasta_fofn."
        elif not nfs_exists(self.fasta_fofn):
            errMsg = "Input fasta_fofn {f} does not exist.".\
                     format(f=self.fasta_fofn)
        elif not all(nfs_exists(fasta_file) for fasta_file in
                     get_files_from_fofn(self.fasta_fofn)):
            # Check the listed fasta files only once the fofn itself is known
            # to exist; the original looped over them in the branch where the
            # fofn was missing, which could never succeed.
            errMsg = "A file listed in fasta_fofn {f} does not exist.".\
                     format(f=self.fasta_fofn)
        elif not nfs_exists(self.nfl_all_pickle_fn):
            # e.g. output/map_noFL/noFL.ALL.partial_uc.pickle
            errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                     "which assigns all non-full-length reads to isoforms " + \
                     "does not exist. Please check that all 'ice_partial.py' " + \
                     "jobs have completed."
        elif not nfs_exists(self.final_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                     "which assigns full-length non-chimeric reads to " + \
                     "isoforms does not exist."

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
Exemplo n.º 7
0
    def validate_inputs(self):
        """Validate input fofns, and root_dir, log_dir, tmp_dir,
        create quivered_dir and quivered_log_dir"""
        self.add_log("Validating inputs.")

        # Create directories: root_dir/quivered and root_dir/log_dir/quivered
        try:
            mkdir(self.quivered_dir)
            mkdir(self.quivered_log_dir)
        except OSError:
            # Multiple ice_quiver_i jobs may run at the same time and try to
            # mkdir, race condition may happen, so ignore OSError here.
            pass

        errMsg = ""

        if not nfs_exists(self.log_dir) or not op.isdir(self.log_dir):
            errMsg = "Log dir {l} is not an existing directory.".\
                format(l=self.log_dir)
        elif self.bas_fofn is None:
            errMsg = "Please specify bas_fofn (e.g. input.fofn)."
        elif not nfs_exists(self.bas_fofn):
            errMsg = "bas_fofn {f} ".format(f=self.bas_fofn) + \
                     "which contains bas/bax.h5 files does not exist."
        elif self.fasta_fofn is None:
            errMsg = "Please make sure ice_make_fasta_fofn has " + \
                     "been called, and specify fasta_fofn."
        elif not nfs_exists(self.fasta_fofn):
            errMsg = "Input fasta_fofn {f} does not exists.".\
                     format(f=self.fasta_fofn)
            fasta_files = get_files_from_fofn(self.fasta_fofn)
            for fasta_file in fasta_files:
                if not nfs_exists(fasta_file):
                    errMsg = "A file {f} in fasta_fofn does not exist.".\
                             format(f=fasta_file)
        elif not nfs_exists(self.nfl_all_pickle_fn):
            #"output/map_noFL/noFL.ALL.partial_uc.pickle"):
            errMsg = "Pickle file {f} ".format(f=self.nfl_all_pickle_fn) + \
                     "which assigns all non-full-length reads to isoforms " + \
                     "does not exist. Please check 'ice_partial.py *' are " + \
                     "all done."
        elif not nfs_exists(self.final_pickle_fn):
            errMsg = "Pickle file {f} ".format(f=self.final_pickle_fn) + \
                     "which assigns full-length non-chimeric reads to " + \
                     "isoforms does not exist."

        if errMsg != "":
            self.add_log(errMsg, level=logging.ERROR)
            raise IOError(errMsg)
Example No. 5
    def __init__(self,
                 root_dir,
                 fasta_filenames,
                 ref_fasta,
                 out_pickle,
                 sge_opts,
                 sa_file=None,
                 ccs_fofn=None):
        """
        fasta_filenames --- a list of split nfl fasta files.

        ref_fasta --- (unpolished) consensus isoforms

        out_pickle --- a pickle file with all nfl fasta reads

        ccs_fofn --- should be reads_of_insert.fofn or None

        root_dir --- ICE root output directory

        sge_opts --- params for the SGE environment, including
            use_sge     : use SGE or not
            max_sge_jobs: maximum number of gcon jobs submitted
            unique_id   : unique qsub job id; it is important that this
                          DOES NOT CONFLICT!
            blasr_nproc : blasr -nproc param, the number of threads blasr uses.
            gcon_nproc  : number of gcon jobs that can run at the same time
        """
        self.prog_name = "IceAllPartials"
        IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir)

        self.fasta_filenames, self.ref_fasta, self.ccs_fofn, self.sa_file = \
            self._validateInputs(fasta_filenames=fasta_filenames,
                                 ref_fasta=ref_fasta,
                                 ccs_fofn=ccs_fofn,
                                 sa_file=sa_file)

        self.out_pickle = out_pickle

        self.sge_opts = sge_opts

        self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
        mkdir(self.nfl_dir)

        self.add_log("input fasta files are: " +
                     ", ".join(self.fasta_filenames))
        self.add_log("temp pickle files are: " +
                     ", ".join(self.pickle_filenames))
        self.add_log("out pickle file is: " + self.out_pickle)
Example No. 6
    def __init__(self,
                 prog_name,
                 root_dir,
                 bas_fofn=None,
                 ccs_fofn=None,
                 fasta_fofn=None,
                 no_log_f=False):
        """
        prog_name --- name of a sub-class
        root_dir --- root directory of the whole project. There will be
                     sub-directories under it, including:
                     tmp/ --- 0/  c0, c1, ..., c9999
                          --- 1/  c10000, c10001, ..., c19999
                          ...
                          each c? folder contains data for a cluster id=c?
                     script/
                          --- 0/  gcon_job_?.sh, gcon jobs in the first iteration
                          --- 1/  gcon_job_?.sh, gcon jobs in the second iteration
                          ...
                     log/
                          --- ICE.log   Log of the ICE algorithm
                          --- 0/  log for jobs in the first iteration
                          ...
                     output/   output files go here.
        bas_fofn --- input.fofn, which contains movie.bas|bax.h5 files.
        ccs_fofn --- a fofn containing movie.ccs.h5 files.
        fasta_fofn --- a fofn containing movie.bax.h5.fasta files.
        no_log_f --- do NOT write log messages to a log file.
        """
        self.prog_name = str(prog_name)
        self.root_dir = real_ppath(root_dir)

        self.bas_fofn = real_ppath(bas_fofn)
        self.ccs_fofn = real_ppath(ccs_fofn)
        self.fasta_fofn = real_ppath(fasta_fofn)

        mkdir(self.root_dir)
        mkdir(self.tmp_dir)
        mkdir(self.log_dir)
        mkdir(self.script_dir)
        mkdir(self.out_dir)

        self.no_log_f = no_log_f
        if not no_log_f:
            self.log_f = open(self.log_fn, 'w', 0)
            self.add_log(msg="Initializing {p}.".format(p=self.prog_name))
Example No. 7
    def __init__(self, root_dir, fastq_filenames, ref_fasta,
                 out_pickle, sge_opts, sa_file=None, ccs_fofn=None):
        """
        fastq_filenames --- a list of split nfl fastq files.

        ref_fasta --- (unpolished) consensus isoforms

        out_pickle --- a pickle file with all nfl reads

        ccs_fofn --- should be reads_of_insert.fofn or None

        root_dir --- ICE root output directory

        sge_opts --- params for the SGE environment, including
            use_sge     : use SGE or not
            max_sge_jobs: maximum number of sub-jobs submitted
            unique_id   : unique qsub job id; it is important that this
                          DOES NOT CONFLICT!
            blasr_nproc : blasr -nproc param, the number of threads blasr uses.
        """
        self.prog_name = "IceAllPartials"
        IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir)

        self.add_log("DEBUG: in IceAllPartials, ccs_fofn is {0}.".format(ccs_fofn), level=logging.INFO)

        self.fastq_filenames, self.ref_fasta, self.ccs_fofn, self.sa_file = \
            self._validate_inputs(fastq_filenames=fastq_filenames,
                                  ref_fasta=ref_fasta,
                                  ccs_fofn=ccs_fofn,
                                  sa_file=sa_file)

        self.out_pickle = out_pickle

        self.sge_opts = sge_opts

        self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
        mkdir(self.nfl_dir)

        self.add_log("input fasta files are: " +
                     ", ".join(self.fastq_filenames))
        self.add_log("temp pickle files are: " +
                     ", ".join(self.pickle_filenames))
        self.add_log("out pickle file is: " + self.out_pickle)
Example No. 8
def convert_fofn_to_fasta(fofn_filename,
                          out_filename,
                          fasta_out_dir,
                          force_overwrite=False):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    out_fns = []
    mkdir(fasta_out_dir)
    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m111xxxx.1.bax.h5.fasta.tmp
        #      out_file = m111xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug(
                "File {0} already exists. skipping.".format(out_file))
        else:
            cmd = "pls2fasta {0} {1} ".format(in_fn, tmp_out_file) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            logging.debug("CMD: {cmd}".format(cmd=cmd))
            _out, _code, _msg = backticks(cmd)
            if _code != 0:
                raise RuntimeError("CMD failed: {cmd}\n".format(cmd=cmd) +
                                   _msg)
            trim_subread_flanks(tmp_out_file, out_file)
        out_fns.append(out_file)
        if op.exists(tmp_out_file):
            os.remove(tmp_out_file)
    write_files_to_fofn(out_fns, out_filename)
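A fofn ("file of file names") is just a text file listing one path per line.
The real get_files_from_fofn / write_files_to_fofn helpers are not shown in
this listing, so the following minimal sketches are assumptions about their
behavior:

    def get_files_from_fofn_sketch(fofn):
        """Read one path per line, skipping blank lines (assumed behavior)."""
        with open(fofn) as f:
            return [line.strip() for line in f if line.strip()]

    def write_files_to_fofn_sketch(fns, fofn):
        """Write one path per line (assumed behavior)."""
        with open(fofn, 'w') as f:
            f.write("\n".join(fns) + "\n")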
Example No. 9
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False, cpus=1):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # multiprocessing worker stuff
    manager = Manager()
    in_queue = manager.Queue(len(in_fns))
    in_queue_count = 0
    outfile_track = {} # expected out file --> (cmd, tmp)
    pool = []
    out_fns = []

    for in_fn in in_fns:
        #print >> sys.stderr, "DEBUG: converting h5 file:", in_fn
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists. skipping.".format(out_file))
            out_fns.append(out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            print >> sys.stderr, "DEBUG: putting in queue:", cmd, tmp_out_file, out_file
            in_queue.put((cmd, tmp_out_file, out_file))
            in_queue_count += 1
            outfile_track[out_file] = (cmd, tmp_out_file)
            print >> sys.stderr, "DEBUG: put in queue:", cmd, tmp_out_file, out_file

    cpus = min(cpus, in_queue_count)  # use no more workers than files to convert
    for i in xrange(cpus):
        p = Process(target=convert_fofn_to_fasta_worker, args=(in_queue,))
        pool.append(p)

    # Start the pool workers, then wait up to 20 minutes for each to finish.
    for p in pool:
        p.start()
    for p in pool:
        p.join(timeout=1200)
        if p.is_alive():
            p.terminate()

    # Check that every expected output exists; if a worker died or timed out
    # before producing one, rerun that conversion locally.
    for out_file, (cmd, tmp_out_file) in outfile_track.iteritems():
        if not op.exists(out_file):
            in_queue.put((cmd, tmp_out_file, out_file))
            convert_fofn_to_fasta_worker(in_queue)
        out_fns.append(out_file)

    write_files_to_fofn(out_fns, out_filename)
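The worker function itself is not part of this listing. A plausible sketch,
reconstructed from how it is called here (one queue argument, items of the
form (cmd, tmp_out_file, out_file)) and from the sequential logic in Example
No. 8; the body is an assumption, not the actual implementation:

    from Queue import Empty  # Python 2, matching the listing's idiom

    def convert_fofn_to_fasta_worker(in_queue):
        """Hypothetical worker: drain the queue, run pls2fasta, trim flanks."""
        while True:
            try:
                cmd, tmp_out_file, out_file = in_queue.get_nowait()
            except Empty:
                break
            _out, _code, _msg = backticks(cmd)  # run the pls2fasta command
            if _code != 0:
                raise RuntimeError("CMD failed: {cmd}\n".format(cmd=cmd) + _msg)
            trim_subread_flanks(tmp_out_file, out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)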
Example No. 10
def convert_fofn_to_fasta(fofn_filename,
                          out_filename,
                          fasta_out_dir,
                          force_overwrite=False,
                          cpus=1):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # multiprocessing worker stuff
    manager = Manager()
    out_fns = manager.list()
    in_queue = manager.Queue(99999)
    pool = []
    for i in xrange(cpus):
        p = Process(target=convert_fofn_to_fasta_worker,
                    args=(in_queue, out_fns))
        pool.append(p)

    for in_fn in in_fns:
        #print >> sys.stderr, "DEBUG: converting h5 file:", in_fn
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug(
                "File {0} already exists. skipping.".format(out_file))
            out_fns.append(out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            print >> sys.stderr, "DEBUG: putting in queue:", cmd, tmp_out_file, out_file
            in_queue.put((cmd, tmp_out_file, out_file))
            print >> sys.stderr, "DEBUG: put in queue:", cmd, tmp_out_file, out_file


    # Start the pool workers, then wait for each to finish.
    for p in pool:
        p.start()
    for p in pool:
        p.join()

    # out_fns is a manager ListProxy; materialize it before writing the fofn.
    write_files_to_fofn(list(out_fns), out_filename)
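This variant's worker takes the shared out_fns list as a second argument; a
hedged sketch mirroring the single-argument version above (again an
assumption, not the actual implementation):

    def convert_fofn_to_fasta_worker(in_queue, out_fns):
        """Hypothetical two-argument worker: convert, then record the output."""
        while True:
            try:
                cmd, tmp_out_file, out_file = in_queue.get_nowait()
            except Empty:  # Queue.Empty, as in the earlier sketch
                break
            _out, _code, _msg = backticks(cmd)
            if _code != 0:
                raise RuntimeError("CMD failed: {cmd}\n".format(cmd=cmd) + _msg)
            trim_subread_flanks(tmp_out_file, out_file)
            out_fns.append(out_file)  # shared manager list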