def run(self):
    """Split the nfl fasta into chunks and touch chunk files not produced.

    Logs ``root_dir``, ``nfl_fa`` and ``N``, obtains ``(num_reads,
    reads_per_split, nfl_dir, splitted_fas_todo)`` from
    ``self.validate_inputs()``, splits ``self.nfl_fa`` with ``splitFasta``
    and then calls ``touch`` on every entry of ``splitted_fas_todo`` that
    ``splitFasta`` did not return.
    """
    logging.debug("root_dir: {d}.".format(d=self.root_dir))
    logging.debug("nfl_fa: {f}.".format(f=self.nfl_fa))
    logging.debug("Total number of chunks: N={N}.".format(N=self.N))

    # Validate input files.
    (num_reads, reads_per_split,
     nfl_dir, splitted_fas_todo) = self.validate_inputs()

    logging.info("Total number of reads is {n}.".format(n=num_reads))
    logging.info("Splitting nfl_fa into chunks each " +
                 "containing {n} reads.".format(n=reads_per_split))

    splitted_fas_done = splitFasta(input_fasta=real_ppath(self.nfl_fa),
                                   reads_per_split=reads_per_split,
                                   out_dir=nfl_dir,
                                   out_prefix="input.split")
    logging.info("Splitted files are: " + "\n".join(splitted_fas_done))

    # Build the lookup once: testing membership against the list would
    # rescan every produced file name for each expected file name.
    produced = set(splitted_fas_done)
    for fa in splitted_fas_todo:
        if fa not in produced:
            logging.info("touching {f}".format(f=fa))
            touch(fa)
def run(self):
    """Split the nfl fasta into chunks and touch chunk files not produced.

    The split plan (``num_reads``, ``reads_per_split``, ``nfl_dir`` and
    ``splitted_fas_todo``) comes from ``self.validate_inputs()``.
    ``self.nfl_fa`` is split with ``splitFasta``; every entry of
    ``splitted_fas_todo`` that ``splitFasta`` did not return is passed to
    ``touch``.
    """
    logging.debug("root_dir: {d}.".format(d=self.root_dir))
    logging.debug("nfl_fa: {f}.".format(f=self.nfl_fa))
    logging.debug("Total number of chunks: N={N}.".format(N=self.N))

    # Validate input files.
    (num_reads, reads_per_split, nfl_dir, splitted_fas_todo) = self.validate_inputs()

    logging.info("Total number of reads is {n}.".format(n=num_reads))
    logging.info("Splitting nfl_fa into chunks each " + "containing {n} reads.".format(n=reads_per_split))

    splitted_fas_done = splitFasta(
        input_fasta=real_ppath(self.nfl_fa),
        reads_per_split=reads_per_split,
        out_dir=nfl_dir,
        out_prefix="input.split",
    )
    logging.info("Splitted files are: " + "\n".join(splitted_fas_done))

    # A set gives constant-time membership tests; the original list lookup
    # was linear in the number of produced files for every expected file.
    produced = set(splitted_fas_done)
    missing = [fa for fa in splitted_fas_todo if fa not in produced]
    for fa in missing:
        logging.info("touching {f}".format(f=fa))
        touch(fa)
def run(self):
    """
    Drive the polishing pipeline for unpolished consensus isoforms.

    Steps, in the order they are executed:
      1. Call self.get_sa_file() (suffix array for the consensus fasta).
      2. Default self.fasta_fofn to <nfl_dir>/input.fasta.fofn when it is
         None, then call convert_fofn_to_fasta on self.bas_fofn.
      3. Split self.nfl_fa into smaller fasta files under self.nfl_dir.
      4. Run IceAllPartials on the split files; its output pickle is
         self.nfl_all_pickle_fn.
      5. Run IceQuiver.
      6. Run IcePostQuiver, which is handed the hq_isoforms_fa|fq and
         lq_isoforms_fa|fq output paths.
    """
    def log_info(message):
        # Every progress message in this method is logged at INFO level.
        self.add_log(message, level=logging.INFO)

    # Create final.consensus.fa.sa
    log_info("Generating suffix array for {f}".format(
        f=self.final_consensus_sa))
    # The return value is not used: sa_file is chosen again further down.
    self.get_sa_file()

    # Create input.fasta.fofn from bas_fofn
    log_info("Creating fasta fofn from bas/bax.h5 fofn")
    if self.fasta_fofn is None:
        self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
    self.add_log("bas fofn={f}".format(f=self.bas_fofn))
    self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
    convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                          out_filename=self.fasta_fofn,
                          fasta_out_dir=self.nfl_dir)

    # Split non-full-length reads into smaller fasta files.
    reads_per_split = self.ice_opts.nfl_reads_per_split
    split_msg = ("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                 "smaller files each containing {n} reads.".format(
                     n=reads_per_split))
    log_info(split_msg)
    self._nfl_splitted_fas = splitFasta(
        input_fasta=self.nfl_fa,
        reads_per_split=reads_per_split,
        out_dir=self.nfl_dir,
        out_prefix="input.split")
    log_info("Splitted files are: " + "\n".join(self._nfl_splitted_fas))

    # Process nfl reads in each splitted fasta.
    log_info("IceAllPartials initiated.")
    # NOTE(review): the existence test is on final_consensus_fa, not on
    # the suffix array file itself -- confirm this is intended.
    if op.exists(self.final_consensus_fa):
        sa_file = self.final_consensus_sa
    else:
        sa_file = None
    self.icep = IceAllPartials(
        root_dir=self.root_dir,
        fasta_filenames=self._nfl_splitted_fas,
        ref_fasta=self.final_consensus_fa,
        out_pickle=self.nfl_all_pickle_fn,
        sge_opts=self.sge_opts,
        sa_file=sa_file,
        ccs_fofn=self.ccs_fofn)
    self.icep.run()
    log_info("IceAllPartials completed.")

    # Polish with quiver.
    log_info("IceQuiver initiated.")
    self.iceq = IceQuiver(
        root_dir=self.root_dir,
        bas_fofn=self.bas_fofn,
        fasta_fofn=self.fasta_fofn,
        sge_opts=self.sge_opts)
    self.iceq.run()
    log_info("IceQuiver finished.")

    # Post-process quiver output into hq/lq isoform files.
    log_info("IcePostQuiver initiated.")
    self.icepq = IcePostQuiver(
        root_dir=self.root_dir,
        hq_isoforms_fa=self.hq_isoforms_fa,
        hq_isoforms_fq=self.hq_isoforms_fq,
        lq_isoforms_fa=self.lq_isoforms_fa,
        lq_isoforms_fq=self.lq_isoforms_fq,
        use_sge=self.sge_opts.use_sge,
        quit_if_not_done=False)
    self.icepq.run()
    log_info("IcePostQuiver finished.")
def run(self):
    """
    Drive the polishing pipeline (daligner-based variant).

    Steps, in the order they are executed:
      1. Default self.fasta_fofn to <nfl_dir>/input.fasta.fofn when it is
         None; call convert_fofn_to_fasta only if that fofn does not
         exist yet.
      2. Split self.nfl_fa into smaller fasta files under self.nfl_dir.
      3. Make a dazz DB for self.final_consensus_fa.
      4. Run IceAllPartials on the split files; its output pickle is
         self.nfl_all_pickle_fn.
      5. Run IceQuiver.
      6. Run IceQuiverPostprocess with self.ipq_opts.
    """
    info = logging.INFO

    # Create input.fasta.fofn from bas_fofn
    self.add_log("Creating fasta fofn from bas/bax.h5 fofn", level=info)
    if self.fasta_fofn is None:
        self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
    self.add_log("bas fofn={f}".format(f=self.bas_fofn))
    self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
    if not op.exists(self.fasta_fofn):
        convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                              out_filename=self.fasta_fofn,
                              fasta_out_dir=self.nfl_dir,
                              cpus=self.sge_opts.blasr_nproc)
    else:
        self.add_log("No need to run convert_fofn_to_fasta.")

    # Split non-full-length reads into smaller fasta files.
    split_msg = ("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                 "smaller files each containing {n} reads.".format(
                     n=self.nfl_reads_per_split))
    self.add_log(split_msg, level=info)
    self._nfl_splitted_fas = splitFasta(
        input_fasta=self.nfl_fa,
        reads_per_split=self.nfl_reads_per_split,
        out_dir=self.nfl_dir,
        out_prefix="input.split")
    self.add_log("Splitted files are: " + "\n".join(self._nfl_splitted_fas),
                 level=info)

    # Generate the dazz DB for final.consensus.fasta
    ref_obj = DazzIDHandler(self.final_consensus_fa, False)
    DalignerRunner.make_db(ref_obj.dazz_filename)
    self.add_log("Dazz DB made for: " + ref_obj.dazz_filename, level=info)

    # Process nfl reads in each splitted fasta.
    self.add_log("Initializing IceAllPartials.", level=info)
    # ToDo: sa_file is passed as None because alignment is switching to
    # daligner; remove sa_file completely once daligner is mature.
    self.icep = IceAllPartials(
        root_dir=self.root_dir,
        fasta_filenames=self._nfl_splitted_fas,
        ref_fasta=self.final_consensus_fa,
        out_pickle=self.nfl_all_pickle_fn,
        sge_opts=self.sge_opts,
        sa_file=None,
        ccs_fofn=self.ccs_fofn)
    self.add_log("IceAllPartials log: {f}.".format(f=self.icep.log_fn),
                 level=info)
    self.icep.run()
    self.add_log("IceAllPartials completed.", level=info)

    # Polish with quiver.
    self.add_log("Initializing IceQuiver.", level=info)
    self.iceq = IceQuiver(
        root_dir=self.root_dir,
        bas_fofn=self.bas_fofn,
        fasta_fofn=self.fasta_fofn,
        sge_opts=self.sge_opts)
    self.add_log("IceQuiver log: {f}.".format(f=self.iceq.log_fn),
                 level=info)
    self.iceq.run()
    self.add_log("IceQuiver finished.", level=info)

    # Post-process quiver output.
    self.add_log("Initializing IceQuiverPostprocess.", level=info)
    self.icepq = IceQuiverPostprocess(
        root_dir=self.root_dir,
        use_sge=self.sge_opts.use_sge,
        quit_if_not_done=False,
        ipq_opts=self.ipq_opts)
    self.add_log("IceQuiverPostprocess log: {f}.".format(
        f=self.icepq.log_fn), level=info)
    self.icepq.run()
    self.add_log("IceQuiverPostprocess finished.", level=info)
def run(self):
    """
    Run the three polishing stages: IceAllPartials, IceQuiver and
    IcePostQuiver.

    Before the stages start, self.get_sa_file() is called, a fasta fofn
    is created from self.bas_fofn (defaulting self.fasta_fofn to
    <nfl_dir>/input.fasta.fofn when it is None), and self.nfl_fa is split
    into smaller fasta files under self.nfl_dir.

    IceAllPartials writes to self.nfl_all_pickle_fn; IcePostQuiver is
    given the hq_isoforms_fa|fq and lq_isoforms_fa|fq output paths.
    """
    info = logging.INFO

    # --- Preparation ---------------------------------------------------
    # Create final.consensus.fa.sa
    sa_msg = "Generating suffix array for {f}".format(
        f=self.final_consensus_sa)
    self.add_log(sa_msg, level=info)
    # Result intentionally unused; sa_file is decided again below.
    self.get_sa_file()

    # Create input.fasta.fofn from bas_fofn
    self.add_log("Creating fasta fofn from bas/bax.h5 fofn", level=info)
    if self.fasta_fofn is None:
        self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")
    self.add_log("bas fofn={f}".format(f=self.bas_fofn))
    self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
    convert_fofn_to_fasta(
        fofn_filename=self.bas_fofn,
        out_filename=self.fasta_fofn,
        fasta_out_dir=self.nfl_dir)

    # Split non-full-length reads into smaller fasta files.
    head = "Splitting {nfl} into ".format(nfl=self.nfl_fa)
    tail = "smaller files each containing {n} reads.".format(
        n=self.ice_opts.nfl_reads_per_split)
    self.add_log(head + tail, level=info)
    split_kwargs = dict(
        input_fasta=self.nfl_fa,
        reads_per_split=self.ice_opts.nfl_reads_per_split,
        out_dir=self.nfl_dir,
        out_prefix="input.split")
    self._nfl_splitted_fas = splitFasta(**split_kwargs)
    self.add_log("Splitted files are: " + "\n".join(self._nfl_splitted_fas),
                 level=info)

    # --- Stage 1: IceAllPartials ---------------------------------------
    self.add_log("IceAllPartials initiated.", level=info)
    # NOTE(review): existence is tested on final_consensus_fa rather than
    # on final_consensus_sa -- confirm this is intended.
    sa_file = None
    if op.exists(self.final_consensus_fa):
        sa_file = self.final_consensus_sa
    self.icep = IceAllPartials(
        root_dir=self.root_dir,
        fasta_filenames=self._nfl_splitted_fas,
        ref_fasta=self.final_consensus_fa,
        out_pickle=self.nfl_all_pickle_fn,
        sge_opts=self.sge_opts,
        sa_file=sa_file,
        ccs_fofn=self.ccs_fofn)
    self.icep.run()
    self.add_log("IceAllPartials completed.", level=info)

    # --- Stage 2: IceQuiver --------------------------------------------
    self.add_log("IceQuiver initiated.", level=info)
    self.iceq = IceQuiver(
        root_dir=self.root_dir,
        bas_fofn=self.bas_fofn,
        fasta_fofn=self.fasta_fofn,
        sge_opts=self.sge_opts)
    self.iceq.run()
    self.add_log("IceQuiver finished.", level=info)

    # --- Stage 3: IcePostQuiver ----------------------------------------
    self.add_log("IcePostQuiver initiated.", level=info)
    self.icepq = IcePostQuiver(
        root_dir=self.root_dir,
        hq_isoforms_fa=self.hq_isoforms_fa,
        hq_isoforms_fq=self.hq_isoforms_fq,
        lq_isoforms_fa=self.lq_isoforms_fa,
        lq_isoforms_fq=self.lq_isoforms_fq,
        use_sge=self.sge_opts.use_sge,
        quit_if_not_done=False)
    self.icepq.run()
    self.add_log("IcePostQuiver finished.", level=info)
def run(self):
    """Call ICE to cluster consensus isoforms.

    Steps, in the order they are executed:
      1. When ``ice_opts.targeted_isoseq`` is set, use a first split of
         1000 reads and set ``ice_opts.flnc_reads_per_split`` to 10000.
      2. Split ``self.flnc_fa`` into smaller fasta files in ``root_dir``.
      3. Convert the first split to fastq and set up the probability/QV
         model (from ccs when ``ice_opts.use_finer_qv``, else from fastq).
      4. Load initial clusters from ``self.initPickleFN`` if it exists,
         otherwise build them with IceInit and dump them to that file.
      5. Run IceIterative and link its consensus fasta to ``self.out_fa``.
      6. If ``ice_opts.quiver`` is True, run Polish; in both cases link
         the cluster report and write the summary.

    Returns:
        0 once the log has been closed.
    """
    self.add_log("Start to run cluster.", level=logging.INFO)

    if self.ice_opts.targeted_isoseq:
        first_split = 1000
        self.ice_opts.flnc_reads_per_split = 10000
        self.add_log(
            "targeted_isoseq: further splitting JUST first split to 1000. Changing flnc_reads_per_split=10000."
        )
    else:
        first_split = None

    # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
    self.add_log(
        "Splitting {flnc} into ".format(flnc=self.flnc_fa)
        + "smaller files each containing {n} reads.".format(n=self.ice_opts.flnc_reads_per_split),
        level=logging.INFO,
    )
    self._flnc_splitted_fas = splitFasta(
        input_fasta=self.flnc_fa,
        reads_per_split=self.ice_opts.flnc_reads_per_split,
        out_dir=self.root_dir,
        out_prefix="input.split",
        first_split=first_split,
    )
    self.add_log("Splitted files are: " + "\n".join(self._flnc_splitted_fas), level=logging.INFO)

    firstSplit = self._flnc_splitted_fas[0]
    # splitext only strips an extension of the basename; slicing at
    # rfind(".") would drop the last character when there is no dot and
    # would cut at a dot in a directory name.
    firstSplit_fq = os.path.splitext(firstSplit)[0] + ".fastq"
    self.add_log(
        "Converting first split file {0} + {1} into fastq\n".format(firstSplit, self.ccs_fofn), level=logging.INFO
    )
    # Convert this into FASTQ
    ice_fa2fq(firstSplit, self.ccs_fofn, firstSplit_fq)

    # Set up probability and quality value model
    if self.ice_opts.use_finer_qv:
        self._setProbQV_ccs(self.ccs_fofn, firstSplit)
    else:
        self._setProbQV_fq(firstSplitFq=firstSplit_fq)

    # Initialize cluster by clique, reusing an existing pickle if present.
    if os.path.exists(self.initPickleFN):
        self.add_log("Reading existing uc pickle: {0}".format(self.initPickleFN), level=logging.INFO)
        with open(self.initPickleFN) as f:
            uc = cPickle.load(f)
    else:
        self.add_log("Finding maximal cliques: initializing IceInit.", level=logging.INFO)
        self.iceinit = IceInit(
            readsFa=firstSplit,
            qver_get_func=self._probqv.get_smoothed,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            qvmean_get_func=self._probqv.get_mean,
        )
        uc = self.iceinit.uc

        # Dump uc to a file
        self.add_log("Dumping initial clusters to {f}".format(f=self.initPickleFN), level=logging.INFO)
        with open(self.initPickleFN, "w") as f:
            cPickle.dump(uc, f)

    # Run IceIterative.
    self.add_log("Iterative clustering: initializing IceIterative.", level=logging.INFO)
    self.icec = IceIterative(
        fasta_filename=firstSplit,
        fasta_filenames_to_add=self._flnc_splitted_fas[1:],
        all_fasta_filename=self.flnc_fa,
        ccs_fofn=self.ccs_fofn,
        root_dir=self.root_dir,
        ice_opts=self.ice_opts,
        sge_opts=self.sge_opts,
        uc=uc,
        probQV=self._probqv,
        fastq_filename=firstSplit_fq,
        use_ccs_qv=self.ice_opts.use_finer_qv,
    )
    self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
    self.icec.run()
    self.add_log("IceIterative completed.", level=logging.INFO)

    # IceIterative done, link the predicted (unpolished) consensus
    # isoforms to the output fasta.
    self.add_log("Creating a link to unpolished consensus isoforms.")
    ln(self.icec.final_consensus_fa, self.out_fa)

    # Call quiver to polish predicted consensus isoforms.
    if self.ice_opts.quiver is not True:
        self.add_log("Creating a link to cluster report.", level=logging.INFO)
        ln(src=self.icec.report_fn, dst=self.report_fn)

        # Summarize cluster and write to summary_fn.
        self.write_summary(summary_fn=self.summary_fn, isoforms_fa=self.out_fa)
    else:  # self.ice_opts.quiver is True
        self.add_log("Polishing clusters: initializing IcePolish.", level=logging.INFO)
        self.pol = Polish(
            root_dir=self.root_dir,
            nfl_fa=self.nfl_fa,
            bas_fofn=self.bas_fofn,
            ccs_fofn=self.ccs_fofn,
            fasta_fofn=self.fasta_fofn,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts,
            ipq_opts=self.ipq_opts,
            nfl_reads_per_split=self.nfl_reads_per_split,
        )
        self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn), level=logging.INFO)
        self.pol.run()
        self.add_log("IcePolish completed.", level=logging.INFO)

        # cluster report
        self.add_log("Creating a link to cluster report.", level=logging.INFO)
        ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

        # Summarize cluster & polish and write to summary_fn.
        self.write_summary(
            summary_fn=self.summary_fn,
            isoforms_fa=self.out_fa,
            hq_fa=self.pol.icepq.quivered_good_fa,
            lq_fa=self.pol.icepq.quivered_bad_fa,
        )

    # Close the log file.
    self.close_log()
    return 0
def run(self):
    """Call ICE to cluster consensus isoforms.

    Steps, in the order they are executed:
      1. Split ``self.flnc_fa`` into smaller fasta files in ``root_dir``.
      2. Convert the first split to fastq and set up the probability/QV
         model (from ccs when ``ice_opts.use_finer_qv``, else from fastq).
      3. Build initial clusters with IceInit and dump them to
         ``self.initPickleFN``.
      4. Run IceIterative and link its consensus fasta to ``self.out_fa``.
      5. If ``ice_opts.quiver`` is True, run Polish; in both cases link
         the cluster report and write the summary.

    Returns:
        0 once the log has been closed.
    """
    # Imported locally so the block does not depend on the file's imports.
    from os.path import splitext

    self.add_log("Start to run cluster.", level=logging.INFO)

    # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
    self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                 "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.flnc_reads_per_split),
                 level=logging.INFO)
    self._flnc_splitted_fas = splitFasta(
        input_fasta=self.flnc_fa,
        reads_per_split=self.ice_opts.flnc_reads_per_split,
        out_dir=self.root_dir,
        out_prefix="input.split")
    self.add_log("Splitted files are: " +
                 "\n".join(self._flnc_splitted_fas),
                 level=logging.INFO)

    firstSplit = self._flnc_splitted_fas[0]
    # splitext only strips an extension of the basename; slicing at
    # rfind('.') would drop the last character when there is no dot and
    # would cut at a dot in a directory name.
    firstSplit_fq = splitext(firstSplit)[0] + '.fastq'
    self.add_log("Converting first split file {0} + {1} into fastq\n".format(
        firstSplit, self.ccs_fofn), level=logging.INFO)
    # Convert this into FASTQ
    ice_fa2fq(firstSplit, self.ccs_fofn, firstSplit_fq)

    # Set up probability and quality value model
    if self.ice_opts.use_finer_qv:
        self._setProbQV_ccs(self.ccs_fofn, firstSplit)
    else:
        self._setProbQV_fq(firstSplitFq=firstSplit_fq)

    # Initialize cluster by clique
    self.add_log("Finding maximal cliques: initializing IceInit.",
                 level=logging.INFO)
    self.iceinit = IceInit(readsFa=firstSplit,
                           qver_get_func=self._probqv.get_smoothed,
                           ice_opts=self.ice_opts,
                           sge_opts=self.sge_opts)
    uc = self.iceinit.uc

    # Dump uc to a file
    self.add_log(
        "Dumping initial clusters to {f}".format(f=self.initPickleFN),
        level=logging.INFO)
    with open(self.initPickleFN, 'w') as f:
        cPickle.dump(uc, f)

    # Run IceIterative.
    self.add_log("Iterative clustering: initializing IceIterative.",
                 level=logging.INFO)
    self.icec = IceIterative(
        fasta_filename=firstSplit,
        fasta_filenames_to_add=self._flnc_splitted_fas[1:],
        all_fasta_filename=self.flnc_fa,
        ccs_fofn=self.ccs_fofn,
        root_dir=self.root_dir,
        ice_opts=self.ice_opts,
        sge_opts=self.sge_opts,
        uc=uc,
        probQV=self._probqv,
        fastq_filename=firstSplit_fq,
        use_ccs_qv=self.ice_opts.use_finer_qv)
    self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
    self.icec.run()
    self.add_log("IceIterative completed.", level=logging.INFO)

    # IceIterative done, link the predicted (unpolished) consensus
    # isoforms to the output fasta.
    self.add_log("Creating a link to unpolished consensus isoforms.")
    ln(self.icec.final_consensus_fa, self.out_fa)

    # Call quiver to polish predicted consensus isoforms.
    if self.ice_opts.quiver is not True:
        self.add_log("Creating a link to cluster report.",
                     level=logging.INFO)
        ln(src=self.icec.report_fn, dst=self.report_fn)

        # Summarize cluster and write to summary_fn.
        self.write_summary(summary_fn=self.summary_fn,
                           isoforms_fa=self.out_fa)
    else:  # self.ice_opts.quiver is True
        self.add_log("Polishing clusters: initializing IcePolish.",
                     level=logging.INFO)
        self.pol = Polish(root_dir=self.root_dir,
                          nfl_fa=self.nfl_fa,
                          bas_fofn=self.bas_fofn,
                          ccs_fofn=self.ccs_fofn,
                          fasta_fofn=self.fasta_fofn,
                          ice_opts=self.ice_opts,
                          sge_opts=self.sge_opts,
                          ipq_opts=self.ipq_opts,
                          nfl_reads_per_split=self.nfl_reads_per_split)
        self.add_log("IcePolish log: {f}.".format(f=self.pol.log_fn),
                     level=logging.INFO)
        self.pol.run()
        self.add_log("IcePolish completed.", level=logging.INFO)

        # cluster report
        self.add_log("Creating a link to cluster report.",
                     level=logging.INFO)
        ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

        # Summarize cluster & polish and write to summary_fn.
        self.write_summary(summary_fn=self.summary_fn,
                           isoforms_fa=self.out_fa,
                           hq_fa=self.pol.icepq.quivered_good_fa,
                           lq_fa=self.pol.icepq.quivered_bad_fa)

    # Close the log file.
    self.close_log()
    return 0
def run(self):
    """Cluster consensus isoforms with ICE and optionally polish them.

    Splits self.flnc_fa, sets up the probability/QV model from the first
    split, obtains initial clusters (from self.initPickleFN when it
    exists, otherwise from IceInit, which are then dumped to that file),
    runs IceIterative, links the unpolished consensus fasta to
    self.out_fa and finally writes a summary. When ice_opts.quiver is
    True, Polish is run first and its hq/lq fasta files are included in
    the summary. Returns 0.
    """
    info = logging.INFO
    self.add_log("Start to run cluster.", level=info)

    # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
    reads_per_split = self.ice_opts.flnc_reads_per_split
    split_msg = ("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                 "smaller files each containing {n} reads.".format(
                     n=reads_per_split))
    self.add_log(split_msg, level=info)
    self._flnc_splitted_fas = splitFasta(
        input_fasta=self.flnc_fa,
        reads_per_split=reads_per_split,
        out_dir=self.root_dir,
        out_prefix="input.split")
    self.add_log("Splitted files are: " + "\n".join(self._flnc_splitted_fas),
                 level=info)

    first_fa = self._flnc_splitted_fas[0]
    remaining_fas = self._flnc_splitted_fas[1:]

    # Set up probability and quality value model
    self._setProbQV(ccs_fofn=self.ccs_fofn, firstSplitFa=first_fa)

    # Initialize cluster by clique; reuse init.pickle when it is there.
    if not os.path.exists(self.initPickleFN):
        self.add_log("Finding maximal cliques.", level=info)
        self.iceinit = IceInit(readsFa=first_fa,
                               qver_get_func=self._probqv.get_smoothed,
                               ice_opts=self.ice_opts,
                               sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Dump uc to a file
        self.add_log("Dumping initial clusters to {f}".format(
            f=self.initPickleFN), level=info)
        with open(self.initPickleFN, 'w') as pickle_out:
            cPickle.dump(uc, pickle_out)
    else:
        self.add_log("{0} already exists. Reading to get uc.".format(
            self.initPickleFN), level=info)
        with open(self.initPickleFN) as pickle_in:
            uc = cPickle.load(pickle_in)

    # Run IceIterative.
    self.add_log("Iteratively clustering.", level=info)
    self.icec = IceIterative(
        fasta_filename=first_fa,
        fasta_filenames_to_add=remaining_fas,
        all_fasta_filename=self.flnc_fa,
        ccs_fofn=self.ccs_fofn,
        root_dir=self.root_dir,
        ice_opts=self.ice_opts,
        sge_opts=self.sge_opts,
        uc=uc,
        probQV=self._probqv)
    self.icec.run()
    clean_up_after_ICE(self.root_dir)

    # IceIterative done, link the predicted (unpolished) consensus
    # isoforms to the output fasta.
    self.add_log("Creating a link to unpolished consensus isoforms.")
    ln(self.icec.final_consensus_fa, self.out_fa)

    # Call quiver to polish predicted consensus isoforms.
    if self.ice_opts.quiver is True:
        # TODO review code
        self.pol = Polish(root_dir=self.root_dir,
                          nfl_fa=self.nfl_fa,
                          bas_fofn=self.bas_fofn,
                          ccs_fofn=self.ccs_fofn,
                          hq_isoforms_fa=self.hq_isoforms_fa,
                          hq_isoforms_fq=self.hq_isoforms_fq,
                          lq_isoforms_fa=self.lq_isoforms_fa,
                          lq_isoforms_fq=self.lq_isoforms_fq,
                          ice_opts=self.ice_opts,
                          sge_opts=self.sge_opts)
        self.pol.run()

        # cluster report
        self.add_log("Creating a link to cluster report.")
        ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

        # Write a summary.
        self.add_log("Writing a summary to {f}".format(f=self.summary_fn),
                     level=info)
        self.writeSummary(fa=self.out_fa,
                          summary_fn=self.summary_fn,
                          hq_fa=self.pol.icepq.quivered_good_fa,
                          lq_fa=self.pol.icepq.quivered_bad_fa)
    else:
        self.add_log("Creating a link to cluster report.")
        ln(src=self.icec.report_fn, dst=self.report_fn)

        self.add_log("Writing a summary to {f}".format(f=self.summary_fn),
                     level=info)
        self.writeSummary(fa=self.out_fa, summary_fn=self.summary_fn)

    # Close the log file.
    self.close_log()
    return 0
def run(self):
    """Prepare ICE clustering and return the IceIterative object.

    Steps, in the order they are executed:
      1. Split ``self.flnc_fa`` into smaller fasta files in ``root_dir``.
      2. Convert the first split to fastq and set up the probability/QV
         model (from ccs when ``ice_opts.use_finer_qv``, else from fastq).
      3. Load initial clusters from ``self.initPickleFN`` if it exists,
         otherwise build them with IceInit and dump them to that file.
      4. Construct IceIterative and store it in ``self.icec``.

    Returns:
        ``self.icec``. This method constructs the IceIterative object but
        does not call its ``run()``.
    """
    self.add_log("Start to run cluster.", level=logging.INFO)

    # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
    self.add_log("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
                 "smaller files each containing {n} reads.".format(
                     n=self.ice_opts.flnc_reads_per_split),
                 level=logging.INFO)
    self._flnc_splitted_fas = splitFasta(
        input_fasta=self.flnc_fa,
        reads_per_split=self.ice_opts.flnc_reads_per_split,
        out_dir=self.root_dir,
        out_prefix="input.split")
    self.add_log("Splitted files are: " +
                 "\n".join(self._flnc_splitted_fas),
                 level=logging.INFO)

    firstSplit = self._flnc_splitted_fas[0]
    # splitext only strips an extension of the basename; slicing at
    # rfind('.') would drop the last character when there is no dot and
    # would cut at a dot in a directory name.
    firstSplit_fq = os.path.splitext(firstSplit)[0] + '.fastq'
    self.add_log("Converting first split file {0} + {1} into fastq\n".format(
        firstSplit, self.ccs_fofn), level=logging.INFO)
    # Convert this into FASTQ
    ice_fa2fq(firstSplit, self.ccs_fofn, firstSplit_fq)

    # Set up probability and quality value model
    if self.ice_opts.use_finer_qv:
        self._setProbQV_ccs(self.ccs_fofn, firstSplit)
    else:
        self._setProbQV_fq(firstSplitFq=firstSplit_fq)

    # Initialize cluster by clique, reusing an existing pickle if present.
    if os.path.exists(self.initPickleFN):
        self.add_log("Reading existing uc pickle: {0}".format(
            self.initPickleFN), level=logging.INFO)
        with open(self.initPickleFN) as f:
            uc = cPickle.load(f)
    else:
        self.add_log("Finding maximal cliques: initializing IceInit.",
                     level=logging.INFO)
        self.iceinit = IceInit(readsFa=firstSplit,
                               qver_get_func=self._probqv.get_smoothed,
                               ice_opts=self.ice_opts,
                               sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Dump uc to a file
        self.add_log("Dumping initial clusters to {f}".format(
            f=self.initPickleFN), level=logging.INFO)
        with open(self.initPickleFN, 'w') as f:
            cPickle.dump(uc, f)

    # Set up IceIterative; running it is left to the caller.
    self.add_log("Iterative clustering: initializing IceIterative.",
                 level=logging.INFO)
    self.icec = IceIterative(
        fasta_filename=firstSplit,
        fasta_filenames_to_add=self._flnc_splitted_fas[1:],
        all_fasta_filename=self.flnc_fa,
        ccs_fofn=self.ccs_fofn,
        root_dir=self.root_dir,
        ice_opts=self.ice_opts,
        sge_opts=self.sge_opts,
        uc=uc,
        probQV=self._probqv,
        fastq_filename=firstSplit_fq,
        use_ccs_qv=self.ice_opts.use_finer_qv)
    self.add_log("IceIterative log: {f}.".format(f=self.icec.log_fn))
    return self.icec
def run(self):
    """Run ICE clustering end to end and write the cluster summary.

    Order of operations: split self.flnc_fa; set up the probability/QV
    model from the first split; get initial clusters (read from
    self.initPickleFN when present, otherwise computed by IceInit and
    dumped there); run IceIterative; link the unpolished consensus fasta
    to self.out_fa; then either link the IceIterative report and
    summarize, or (ice_opts.quiver is True) run Polish, link its report
    and summarize with hq/lq fasta files. Returns 0.
    """
    def log_info(message):
        # Shorthand for the INFO-level messages used throughout.
        self.add_log(message, level=logging.INFO)

    log_info("Start to run cluster.")

    # Split flnc_fa into smaller files and save files to _flnc_splitted_fas.
    log_info("Splitting {flnc} into ".format(flnc=self.flnc_fa) +
             "smaller files each containing {n} reads.".format(
                 n=self.ice_opts.flnc_reads_per_split))
    self._flnc_splitted_fas = splitFasta(
        input_fasta=self.flnc_fa,
        reads_per_split=self.ice_opts.flnc_reads_per_split,
        out_dir=self.root_dir,
        out_prefix="input.split")
    log_info("Splitted files are: " + "\n".join(self._flnc_splitted_fas))

    seed_fa = self._flnc_splitted_fas[0]

    # Set up probability and quality value model
    self._setProbQV(ccs_fofn=self.ccs_fofn, firstSplitFa=seed_fa)

    # Initialize cluster by clique. If init.pickle already exists there
    # is no need to run IceInit.
    pickle_exists = os.path.exists(self.initPickleFN)
    if pickle_exists:
        log_info("{0} already exists. Reading to get uc.".format(
            self.initPickleFN))
        with open(self.initPickleFN) as handle:
            uc = cPickle.load(handle)
    else:
        log_info("Finding maximal cliques.")
        self.iceinit = IceInit(
            readsFa=seed_fa,
            qver_get_func=self._probqv.get_smoothed,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts)
        uc = self.iceinit.uc

        # Dump uc to a file
        log_info("Dumping initial clusters to {f}".format(
            f=self.initPickleFN))
        with open(self.initPickleFN, 'w') as handle:
            cPickle.dump(uc, handle)

    # Run IceIterative.
    log_info("Iteratively clustering.")
    self.icec = IceIterative(
        fasta_filename=seed_fa,
        fasta_filenames_to_add=self._flnc_splitted_fas[1:],
        all_fasta_filename=self.flnc_fa,
        ccs_fofn=self.ccs_fofn,
        root_dir=self.root_dir,
        ice_opts=self.ice_opts,
        sge_opts=self.sge_opts,
        uc=uc,
        probQV=self._probqv)
    self.icec.run()
    clean_up_after_ICE(self.root_dir)

    # IceIterative done, link the predicted (unpolished) consensus
    # isoforms to the output fasta.
    self.add_log("Creating a link to unpolished consensus isoforms.")
    ln(self.icec.final_consensus_fa, self.out_fa)

    # Call quiver to polish predicted consensus isoforms.
    if self.ice_opts.quiver is not True:
        self.add_log("Creating a link to cluster report.")
        ln(src=self.icec.report_fn, dst=self.report_fn)

        log_info("Writing a summary to {f}".format(f=self.summary_fn))
        self.writeSummary(fa=self.out_fa, summary_fn=self.summary_fn)
    else:
        # self.ice_opts.quiver is True
        # TODO review code
        polish_kwargs = dict(
            root_dir=self.root_dir,
            nfl_fa=self.nfl_fa,
            bas_fofn=self.bas_fofn,
            ccs_fofn=self.ccs_fofn,
            hq_isoforms_fa=self.hq_isoforms_fa,
            hq_isoforms_fq=self.hq_isoforms_fq,
            lq_isoforms_fa=self.lq_isoforms_fa,
            lq_isoforms_fq=self.lq_isoforms_fq,
            ice_opts=self.ice_opts,
            sge_opts=self.sge_opts)
        self.pol = Polish(**polish_kwargs)
        self.pol.run()

        # cluster report
        self.add_log("Creating a link to cluster report.")
        ln(src=self.pol.iceq.report_fn, dst=self.report_fn)

        # Write a summary.
        log_info("Writing a summary to {f}".format(f=self.summary_fn))
        self.writeSummary(fa=self.out_fa,
                          summary_fn=self.summary_fn,
                          hq_fa=self.pol.icepq.quivered_good_fa,
                          lq_fa=self.pol.icepq.quivered_bad_fa)

    # Close the log file.
    self.close_log()
    return 0