def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
    """
    Reconstruct ref_fa of the cluster in the new tmp_dir,
    e.g., self.g_consensus_ref_fa_of_cluster(cid)

    cids --- list[int(cid)], e.g., [10, 11, 12, ..., 20]
    refs --- dict{int(cid): ref_fa of cluster(cid)}
    """
    # Check existence the first time it is read.
    if not nfs_exists(self.final_consensus_fa):
        raise IOError("Final consensus FASTA file {f} ".format(
            f=self.final_consensus_fa) + "does not exist.")

    self.add_log("Reconstructing g consensus files for clusters "
                 "[%d, %d] in %s" % (cids[0], cids[-1], self.tmp_dir),
                 level=logging.INFO)

    final_consensus_d = FastaRandomReader(self.final_consensus_fa)
    for ref_id in final_consensus_d.d.keys():
        cid = int(ref_id.split('/')[0].replace('c', ''))
        # e.g., ref_id = c103/1/3708, cid = 103,
        # refs[cid] = ...tmp/0/c103/g_consensus_ref.fasta
        if cid in cids:
            mkdir(self.cluster_dir(cid))
            ref_fa = op.join(self.cluster_dir(cid), op.basename(refs[cid]))
            refs[cid] = ref_fa
            with FastaWriter(ref_fa) as writer:
                self.add_log("Writing ref_fa %s" % refs[cid])
                writer.writeRecord(ref_id,
                                   final_consensus_d[ref_id].sequence[:])

    self.add_log("Reconstruction of g consensus files completed.",
                 level=logging.INFO)
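# Usage sketch (hypothetical; `obj` stands for the cluster object owning
# tmp_dir, cluster_dir() and final_consensus_fa -- the paths below are
# illustrative only, not from this module):
#
#   cids = list(range(10, 21))   # clusters 10..20 in this bin
#   refs = {cid: op.join("old_tmp", str(cid), "g_consensus_ref.fasta")
#           for cid in cids}
#   obj.reconstruct_ref_fa_for_clusters_in_bin(cids=cids, refs=refs)
#   # On return, refs[cid] has been updated in place to point at the
#   # reconstructed copy under obj.cluster_dir(cid).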
def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
    """
    Reconstruct ref_fa of the cluster in the new tmp_dir,
    e.g., self.g_consensus_ref_fa_of_cluster(cid)

    Liz: new cids after ice2 collection are b<bin>_c<cid>
    refs --- dict{cid: ref_fa of cluster(cid)}
    """
    # Check existence the first time it is read.
    if not nfs_exists(self.final_consensus_fa):
        raise IOError("Final consensus FASTA file {f} ".format(
            f=self.final_consensus_fa) + "does not exist.")

    msg = "Reconstructing g consensus files for clusters {0}, {1} in {2}".format(
        cids[0], cids[-1], self.tmp_dir)
    print(msg)
    self.add_log(msg)

    final_consensus_d = FastaRandomReader(self.final_consensus_fa)
    for ref_id in list(final_consensus_d.d.keys()):
        # Liz: this is no longer valid for the Ice2 cids
        #cid = int(ref_id.split('/')[0].replace('c', ''))
        cid = ref_id
        if cid in cids:
            _dir = self.cluster_dir_for_reconstructed_ref(cid)
            mkdir(_dir)
            ref_fa = op.join(_dir, op.basename(refs[cid]))
            refs[cid] = ref_fa
            with FastaWriter(ref_fa) as writer:
                self.add_log("Writing ref_fa %s" % refs[cid])
                writer.writeRecord(ref_id,
                                   final_consensus_d[ref_id].sequence[:])

    self.add_log("Reconstruction of g consensus files completed.",
                 level=logging.INFO)
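# Usage sketch for the Ice2 variant (hypothetical values): cluster ids are
# now strings of the form b<bin>_c<cid>, so refs is keyed by those strings
# and each ref_id in final_consensus_fa is matched against cids directly.
#
#   cids = ["b0_c103", "b0_c104"]
#   refs = {cid: "g_consensus_ref.fasta" for cid in cids}
#   obj.reconstruct_ref_fa_for_clusters_in_bin(cids=cids, refs=refs)
#   # Per-cluster FASTAs are rewritten under
#   # obj.cluster_dir_for_reconstructed_ref(cid).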
def choose_template_by_blasr(fasta_filename, out_filename,
                             nproc=8, maxScore=-1000,
                             min_number_reads=1):
    """
    Choose the best template for gcon reference.
    Pick the one that has the highest average hit score against the others.

    Returns: FastaRecord of selected ref
    """
    fd = FastaRandomReader(fasta_filename)

    cmd = "blasr --nproc {nproc} ".format(nproc=nproc) + \
          "--maxScore {score} ".format(score=maxScore) + \
          "--maxLCPLength 15 --bestn 10 --nCandidates 50 " + \
          "-m 1 {fa} {fa} ".format(fa=fasta_filename) + \
          "--out {out} ".format(out=out_filename) + \
          "1>/dev/null 2>/dev/null"

    out, code, msg = backticks(cmd)
    if code != 0:
        return None

    # blasr -m 1 output format:
    # (0) qName   (1) tName   (2) qStrand
    # (3) tStrand (4) score   (5) percentSimilarity
    # (6) tStart  (7) tEnd    (8) tLength
    # (9) qStart  (10) qEnd   (11) qLength
    # (12) nCells
    scores = defaultdict(list)
    with open(out_filename) as f:
        for line in f:
            raw = line.strip().split()
            # qID gets an extra /0_length suffix
            qID, tID = raw[0][:raw[0].rfind('/')], raw[1]
            if qID == tID:
                continue  # self-hit, ignore
            if raw[2] != raw[3]:
                continue  # hits have to be on the same strand
            scores[qID].append(abs(float(raw[4])))  # Liz: use score as the scorer!

    # Find the one with the highest average alignment score.
    score_array = [(np.ceil(np.mean(v)), k) for k, v in scores.items()]
    if len(score_array) < min_number_reads:
        errMsg = "Not enough reads in choose_template_by_blasr: " + \
                 "{0} < {1}".format(len(score_array), min_number_reads)
        raise AlignGraphUtilError(errMsg)
    score_array.sort(reverse=True)

    # Find the longest sequence that is within one standard deviation of
    # the best score.
    best_mean_std = np.std([x[0] for x in score_array])
    best_mean, best_id = score_array[0]
    best_len = len(fd[best_id].sequence)
    for _mean, _id in score_array[1:]:
        if abs(_mean - best_mean) > best_mean_std:
            break
        _len = len(fd[_id].sequence)
        if _len > best_len:
            best_id = _id
            best_len = _len
    return fd[best_id]
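# Usage sketch (hypothetical filenames; requires blasr on $PATH -- the
# function returns None when the blasr call itself fails, and raises
# AlignGraphUtilError when too few usable self-hits survive filtering):
#
#   try:
#       ref = choose_template_by_blasr(fasta_filename="in.fasta",
#                                      out_filename="in.saln",
#                                      nproc=8, maxScore=-1000,
#                                      min_number_reads=1)
#   except AlignGraphUtilError:
#       ref = None
#   if ref is not None:
#       print(ref.name, len(ref.sequence))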
def pick_rep(isoform_filename, gff_filename, group_filename,
             output_filename, pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If input is a FASTA file -- always pick the longest one.
    If input is a FASTQ file --
        If pick_least_err_instead is True, pick the one with the fewest
        expected base errors.
        Else, pick the longest one.
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and (_fns[0].endswith(".fq") or
                               _fns[0].endswith(".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be indexed FASTA, or contain exactly one FASTQ file
                raise IOError("%s must either contain indexed FASTA files or "
                              % isoform_filename +
                              "exactly one FASTQ file!")
    else:
        raise IOError("Unable to recognize file type of %s." %
                      isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None
    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." %
                             isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." %
                      output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start,
                                                r.end, r.strand)

    if bad_gff_filename is not None:
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start,
                                                    r.end, r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if pb_id not in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members:
            if is_fq and pick_least_err_instead:
                # Expected number of base errors from Phred QVs:
                # P(err) = 10^(-Q/10), summed over all bases.
                err = sum(10**-(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and
                    len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
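# Usage sketch (hypothetical filenames). With pick_least_err_instead=True on
# FASTQ input, each read's expected error count is sum(10^(-Q/10)) over its
# Phred QVs; e.g. 1000 bases at Q30 give 1000 * 10^-3 = 1.0 expected errors.
#
#   pick_rep(isoform_filename="hq_isoforms.fastq",
#            gff_filename="touse.gff",
#            group_filename="touse.group.txt",
#            output_filename="rep.fastq",
#            pick_least_err_instead=True,
#            bad_gff_filename=None)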