def run(self):
    """Split the input nFL FASTA into chunks; touch any expected chunk
    file that was not produced."""
    logging.debug("root_dir: {d}.".format(d=self.root_dir))
    logging.debug("nfl_fa: {f}.".format(f=self.nfl_fa))
    logging.debug("Total number of chunks: N={N}.".format(N=self.N))

    # Validate input files.
    (num_reads, reads_per_split, nfl_dir, splitted_fas_todo) = \
        self.validate_inputs()

    logging.info("Total number of reads is {n}.".format(n=num_reads))
    logging.info("Splitting nfl_fa into chunks each " +
                 "containing {n} reads.".format(n=reads_per_split))

    splitted_fas_done = splitFasta(input_fasta=real_ppath(self.nfl_fa),
                                   reads_per_split=reads_per_split,
                                   out_dir=nfl_dir,
                                   out_prefix="input.split")

    logging.info("Split files are: " + "\n".join(splitted_fas_done))

    # Touch any expected chunk file that splitFasta did not need to create.
    for fa in splitted_fas_todo:
        if fa not in splitted_fas_done:
            logging.info("touching {f}".format(f=fa))
            touch(fa)
def build_uc_from_partial_daligner(input_fastq, ref_fasta, out_pickle,
                                   ccs_fofn=None, done_filename=None,
                                   use_finer_qv=False, cpus=24,
                                   no_qv_or_aln_checking=True):
    """
    Given an input_fastq file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).

    Finally, save {isoform_id: [read_ids],
                   nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    """
    input_fastq = realpath(input_fastq)
    input_fasta = input_fastq[:input_fastq.rfind('.')] + '.fasta'
    ice_fq2fa(input_fastq, input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    daligner_sensitive_mode, _low, _high, _ignore5, _ignore3, _ece_min_len = \
        get_daligner_sensitivity_setting(ref_fasta, is_fasta=True)

    # DB should always be already converted
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(input_fasta, ref_fasta, is_FL=False, same_strand_only=False,
                            query_converted=True, db_converted=True, query_made=False,
                            db_made=True, use_sge=False, cpus=cpus, sge_opts=None)
    las_filenames, las_out_filenames = runner.runHPC(
        min_match_len=_low, output_dir=output_dir,
        sensitive_mode=daligner_sensitive_mode)

    if no_qv_or_aln_checking:
        # Not using QVs or alignment checking!
        # This probqv is just a DUMMY to pass to dalign_against_ref; it won't be used.
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # if ccs_fofn is None:
        #     logging.info("Loading probability from model (0.01,0.07,0.06)")
        #     probqv = ProbFromModel(.01, .07, .06)
        # else:
        start_t = time.time()
        probqv = ProbFromFastq(input_fastq)
        logging.info("Loading QVs from {fq} took {s} secs".format(
            fq=input_fastq, s=time.time() - start_t))
        # --------- comment out below since we are just using FASTQ / BAM
        # if use_finer_qv:
        #     probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
        #     logging.info("Loading QVs from {i} + {f} took {s} secs".format(
        #         f=ccs_fofn, i=input_fasta, s=time.time()-start_t))
        # else:
        #     input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
        #     logging.info("Converting {i} + {f} --> {fq}".format(
        #         i=input_fasta, f=ccs_fofn, fq=input_fastq))
        #     ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
        #     probqv = ProbFromFastq(input_fastq)
        #     logging.info("Loading QVs from {fq} took {s} secs".format(
        #         fq=input_fastq, s=time.time()-start_t))
        #     print >> sys.stderr, "Loading QVs from {fq} took {s} secs".format(
        #         fq=input_fastq, s=time.time()-start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hitItems = dalign_against_ref(input_obj, ref_obj, las_out_filename,
                                      is_FL=False,
                                      sID_starts_with_c=True,
                                      qver_get_func=probqv.get_smoothed,
                                      qvmean_get_func=probqv.get_mean,
                                      ece_penalty=1,
                                      ece_min_len=_ece_min_len,
                                      same_strand_only=False,
                                      no_qv_or_aln_checking=no_qv_or_aln_checking,
                                      max_missed_start=_ignore5,
                                      max_missed_end=_ignore3)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing {0} took {1} sec".format(
            las_out_filename, time.time() - start_t))
        print >> sys.stderr, "processing {0} took {1} sec".format(
            las_out_filename, time.time() - start_t)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # Remove all the .las and .las.out files.
    for file in las_filenames:
        os.remove(file)
    for file in las_out_filenames:
        os.remove(file)
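
# ---------------------------------------------------------------------------
# Illustrative helper, not part of the original pipeline: a minimal sketch of
# how the {partial_uc, nohit} pickle written above can be loaded and
# summarized. The helper name and the example path are hypothetical.
def _summarize_partial_uc(pickle_filename):
    """Load a {partial_uc, nohit} pickle and log basic counts."""
    import cPickle
    with open(pickle_filename) as handle:
        d = cPickle.load(handle)
    n_assigned = sum(len(reads) for reads in d['partial_uc'].itervalues())
    logging.info("{c} clusters, {a} assigned reads, {n} reads with no hit.".
                 format(c=len(d['partial_uc']), a=n_assigned,
                        n=len(d['nohit'])))
    return d

# Example (hypothetical path):
#     _summarize_partial_uc("input.split_000.fasta.partial_uc.pickle")
# ---------------------------------------------------------------------------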
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None, done_filename=None,
                          blasr_nproc=12, use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).

    Finally, save {isoform_id: [read_ids],
                   nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                f=ccs_fofn, i=input_fasta, s=time.time() - start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(
                i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            logging.info("Loading QVs from {fq} took {s} secs".format(
                fq=input_fastq, s=time.time() - start_t))

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=20,
                                 same_strand_only=False,
                                 max_missed_start=200,
                                 max_missed_end=50)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
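
# ---------------------------------------------------------------------------
# Illustrative sketch, not the pipeline's own merge step: ice_partial writes
# one {partial_uc, nohit} pickle per nFL chunk, so per-chunk results can in
# principle be combined by unioning each cluster's read list and unioning the
# 'nohit' sets. The helper name and file names are hypothetical.
def _merge_partial_uc_pickles(pickle_filenames, out_pickle):
    """Merge several {partial_uc, nohit} pickles into a single pickle."""
    import cPickle
    merged_uc = {}
    merged_nohit = set()
    for fn in pickle_filenames:
        with open(fn) as handle:
            d = cPickle.load(handle)
        for cid, read_ids in d['partial_uc'].iteritems():
            merged_uc.setdefault(cid, set()).update(read_ids)
        merged_nohit.update(d['nohit'])
    # Chunks contain disjoint reads, but be defensive: a read assigned to a
    # cluster in any chunk should not also be reported as a no-hit.
    for read_ids in merged_uc.itervalues():
        merged_nohit.difference_update(read_ids)
    merged_uc = dict((cid, list(reads)) for cid, reads in merged_uc.iteritems())
    with open(out_pickle, 'w') as handle:
        cPickle.dump({'partial_uc': merged_uc, 'nohit': merged_nohit}, handle)

# Example (hypothetical paths):
#     _merge_partial_uc_pickles(["input.split_000.partial_uc.pickle",
#                                "input.split_001.partial_uc.pickle"],
#                               "combined.partial_uc.pickle")
# ---------------------------------------------------------------------------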
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   ccs_fofn=None, done_filename=None,
                                   use_finer_qv=False, cpus=24,
                                   no_qv_or_aln_checking=True):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).

    Finally, save {isoform_id: [read_ids],
                   nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = os.path.dirname(out_pickle)

    daligner_sensitive_mode, _low, _high = \
        get_daligner_sensitivity_setting(ref_fasta)

    # DB should always be already converted
    ref_obj = DazzIDHandler(ref_fasta, True)
    input_obj = DazzIDHandler(input_fasta, False)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(input_fasta, ref_fasta, is_FL=False, same_strand_only=False,
                            query_converted=True, db_converted=True, query_made=False,
                            db_made=True, use_sge=False, cpus=cpus, sge_opts=None)
    las_filenames, las_out_filenames = runner.runHPC(
        min_match_len=300, output_dir=output_dir,
        sensitive_mode=daligner_sensitive_mode)

    if no_qv_or_aln_checking:
        # Not using QVs or alignment checking!
        # This probqv is just a DUMMY to pass to dalign_against_ref; it won't be used.
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        if ccs_fofn is None:
            logging.info("Loading probability from model (0.01,0.07,0.06)")
            probqv = ProbFromModel(.01, .07, .06)
        else:
            start_t = time.time()
            if use_finer_qv:
                probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
                logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                    f=ccs_fofn, i=input_fasta, s=time.time() - start_t))
            else:
                input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
                logging.info("Converting {i} + {f} --> {fq}".format(
                    i=input_fasta, f=ccs_fofn, fq=input_fastq))
                ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
                probqv = ProbFromFastq(input_fastq)
                logging.info("Loading QVs from {fq} took {s} secs".format(
                    fq=input_fastq, s=time.time() - start_t))
                print >> sys.stderr, "Loading QVs from {fq} took {s} secs".format(
                    fq=input_fastq, s=time.time() - start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for las_out_filename in las_out_filenames:
        start_t = time.time()
        hitItems = dalign_against_ref(input_obj, ref_obj, las_out_filename,
                                      is_FL=False,
                                      sID_starts_with_c=True,
                                      qver_get_func=probqv.get_smoothed,
                                      qvmean_get_func=probqv.get_mean,
                                      ece_penalty=1,
                                      ece_min_len=20,
                                      same_strand_only=False,
                                      no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing {0} took {1} sec".format(
            las_out_filename, time.time() - start_t))
        print >> sys.stderr, "processing {0} took {1} sec".format(
            las_out_filename, time.time() - start_t)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)

    # Remove all the .las and .las.out files.
    for file in las_filenames:
        os.remove(file)
    for file in las_out_filenames:
        os.remove(file)
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None, done_filename=None,
                          blasr_nproc=12, use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).

    Finally, save {isoform_id: [read_ids],
                   nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                f=ccs_fofn, i=input_fasta, s=time.time() - start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(
                i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            logging.info("Loading QVs from {fq} took {s} secs".format(
                fq=input_fastq, s=time.time() - start_t))

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None, done_filename=None,
                          blasr_nproc=12):
    """Align consensus isoforms in ref_fasta and reads in input_fasta,
    and save mappings between isoforms and reads to out_pickle.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=input_fasta) + \
          "{r} -bestn 5 ".format(r=ref_fasta) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 -out {o} ".format(o=m5_file)
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=sa_file)

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        logging.info("Loading probability from QV in {f}".format(f=ccs_fofn))
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = []
            partial_uc[h.cID].append(h.qID)
            seen.add(h.qID)

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    with open(out_pickle, 'w') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
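
# ---------------------------------------------------------------------------
# Example invocation (a minimal sketch with hypothetical paths), assigning the
# nFL reads of one chunk to the current consensus isoforms:
#
#     build_uc_from_partial(
#         input_fasta="input.split_000.fasta",
#         ref_fasta="output/final.consensus.fasta",
#         out_pickle="input.split_000.partial_uc.pickle",
#         ccs_fofn=None,        # no QVs available; fall back to ProbFromModel
#         blasr_nproc=8)
# ---------------------------------------------------------------------------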