def test_collapse_fuzzy_junctions(self):
    """Test collapse_fuzzy_junctions, can_merge and compare_fuzzy_junctions."""
    test_name = "collapse_fuzzy_junctions"
    input_gff = op.join(_DAT_DIR_, "input_%s.gff" % test_name)
    input_group = op.join(_DAT_DIR_, "input_%s.group.txt" % test_name)
    output_gff = op.join(_OUT_DIR_, "output_%s.gff" % test_name)
    output_group = op.join(_OUT_DIR_, "output_%s.group.txt" % test_name)

    records = [r for r in CollapseGffReader(input_gff)]
    self.assertEqual(len(records), 4)
    r0, r1, r2, r3 = records

    # Compare r0 and r1: r0 is a subset of r1, and they can be merged.
    m = compare_fuzzy_junctions(r0.ref_exons, r1.ref_exons,
                                max_fuzzy_junction=5)
    self.assertEqual(m, "subset")
    self.assertTrue(can_merge(m, r0, r1,
                              allow_extra_5exon=True,
                              max_fuzzy_junction=5))

    # Compare r2 and r3: their junctions match exactly, and they can be merged.
    m = compare_fuzzy_junctions(r2.ref_exons, r3.ref_exons,
                                max_fuzzy_junction=5)
    self.assertEqual(m, "exact")
    self.assertTrue(can_merge(m, r2, r3,
                              allow_extra_5exon=True,
                              max_fuzzy_junction=5))

    # Call collapse_fuzzy_junctions and write fuzzy output.
    collapse_fuzzy_junctions(gff_filename=input_gff,
                             group_filename=input_group,
                             fuzzy_gff_filename=output_gff,
                             fuzzy_group_filename=output_group,
                             allow_extra_5exon=True,
                             max_fuzzy_junction=5)

    r4, r5 = [r for r in CollapseGffReader(output_gff)]
    self.assertEqual(r1, r4)
    self.assertEqual(r3, r5)
def test_good_isoform_ids_by_removing_subsets(self):
    """Test good_isoform_ids_by_removing_subsets."""
    all_ids = [r.seqid for r in CollapseGffReader(GFF_FN)]
    good = good_isoform_ids_by_removing_subsets(in_gff_filename=GFF_FN,
                                                max_fuzzy_junction=5)
    diff = list(set(all_ids) - set(good))
    self.assertEqual(diff, self.expected_diff)
def __init__(self, gff_filename, group_filename, self_prefix=None,
             max_fuzzy_junction=0):
    self.gff_filename = gff_filename
    self.group_filename = group_filename
    self.self_prefix = self_prefix
    self.max_fuzzy_junction = max_fuzzy_junction
    self.record_d = dict((r.seqid, r)
                         for r in CollapseGffReader(gff_filename))
    # chr --> strand --> IntervalTree of collapsed isoform records
    self.tree = read_gff_as_interval_tree(gff_filename=self.gff_filename)
    # group info, ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
    self.group_info = MegaPBTree.read_group(self.group_filename,
                                            self.self_prefix)

    # Keep track of gff|group files that have been added.
    self._sample_prefixes = []
    self._group_filenames = []
    self._gff_filenames = []
    self._add_sample_files(gff_filename=gff_filename,
                           group_filename=group_filename,
                           sample_prefix="first_sample")
def read_gff_as_interval_tree(gff_filename):
    """Read a collapsed GFF file into interval trees, one per (chromosome, strand)."""
    # chr --> strand --> IntervalTree of collapsed isoform records
    tree = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})
    for r in CollapseGffReader(gff_filename):
        tree[r.chr][r.strand].insert(r.start, r.end, r)
    return tree
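# Usage sketch (not part of the library): query the per-(chromosome, strand)
# interval trees returned by read_gff_as_interval_tree(). The GFF path, the
# chromosome name and the window below are hypothetical; IntervalTree.find()
# is the same call used by collapse_fuzzy_junctions() in this module.
#
#     tree = read_gff_as_interval_tree(gff_filename="sample.collapsed.gff")
#     for rec in tree["chr1"]["+"].find(10000, 20000):
#         # rec is the CollapseGffReader record inserted over [rec.start, rec.end)
#         print("%s %d %d" % (rec.seqid, rec.start, rec.end))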
def test_filter_out_subsets(self):
    """Test filter_out_subsets."""
    out_abundance_fn = op.join(_OUT_DIR_, "filter_out_subsets.abundance.txt")
    out_gff_fn = op.join(_OUT_DIR_, "filter_out_subsets.gff")
    out_rep_fn = op.join(_OUT_DIR_, "filter_out_subsets.rep.fastq")

    filter_out_subsets(in_abundance_filename=ABUNDANCE_FN,
                       in_gff_filename=GFF_FN,
                       in_rep_filename=REP_FN,
                       out_abundance_filename=out_abundance_fn,
                       out_gff_filename=out_gff_fn,
                       out_rep_filename=out_rep_fn,
                       max_fuzzy_junction=5)

    all_ids = [r.seqid for r in CollapseGffReader(GFF_FN)]
    expected_good = set(all_ids) - set(self.expected_diff)

    out_abundance_ids = [r.pbid for r in AbundanceReader(out_abundance_fn)]
    self.assertEqual(set(out_abundance_ids), expected_good)

    out_gff_ids = [r.seqid for r in CollapseGffReader(out_gff_fn)]
    self.assertEqual(set(out_gff_ids), expected_good)

    out_rep_ids = [r.name.split('|')[0] for r in FastqReader(out_rep_fn)]
    self.assertEqual(set(out_rep_ids), expected_good)
def run_after(self, rtc, output_dir):
    rep_fn = rtc.task.output_files[0]
    gff_fn = rtc.task.output_files[1]
    abundance_fn = rtc.task.output_files[2]
    group_fn = rtc.task.output_files[3]
    read_stat_fn = rtc.task.output_files[4]

    from pbcore.io import FastqReader
    from pbtranscript.io import (CollapseGffReader, AbundanceReader,
                                 GroupReader, ReadStatReader)

    self.assertEqual(len([r for r in FastqReader(rep_fn)]), 65)
    self.assertEqual(len([r for r in CollapseGffReader(gff_fn)]), 65)
    self.assertEqual(len([r for r in AbundanceReader(abundance_fn)]), 65)
    self.assertEqual(len([r for r in GroupReader(group_fn)]), 86)
    self.assertEqual(len([r for r in ReadStatReader(read_stat_fn)]), 10873)
def add_sample(self, gff_filename, group_filename, sample_prefix,
               o_gff_fn, o_group_fn, o_mega_fn):
    """Add one more sample to this MegaPBTree object.

    Read the gff file to get collapsed isoforms from the new sample,
    combine them with the existing collapsed isoforms, and update the tree.
    """
    self._add_sample_files(gff_filename=gff_filename,
                           group_filename=group_filename,
                           sample_prefix=sample_prefix)

    # List of (r1, r2) pairs: r1 from the existing tree, r2 from the new sample.
    # Keep r1 if r2 is None, r2 if r1 is None, else the longer of the two.
    combined = []
    unmatched_recs = list(self.record_d.keys())
    for r in CollapseGffReader(gff_filename):
        match_rec = self.match_record_to_tree(r)
        if match_rec is not None:  # found a match! put longer of r1/r2 in
            combined.append((match_rec, r))
            try:
                unmatched_recs.remove(match_rec.seqid)
            except ValueError:
                pass  # already deleted; this happens for single-exon transcripts
        else:  # r is not present in the current tree
            combined.append((None, r))
    # Put whatever is left from the tree in.
    for seqid in unmatched_recs:
        combined.append((self.record_d[seqid], None))

    # Create a ClusterTree to re-calculate the loci/transcripts.
    final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0),
                                      '-': ClusterTree(0, 0)})
    for i, (r1, r2) in enumerate(combined):
        if r2 is None or (r1 is not None and
                          r1.end - r1.start > r2.end - r2.start):
            final_tree[r1.chr][r1.strand].insert(r1.start, r1.end, i)
        else:
            final_tree[r2.chr][r2.strand].insert(r2.start, r2.end, i)

    self.write_cluster_tree_as_gff(final_tree, combined, group_filename,
                                   sample_prefix, o_gff_fn, o_group_fn,
                                   o_mega_fn)
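# Usage sketch (not part of the library): chain two samples through a MegaPBTree.
# All file names and sample prefixes below are hypothetical; the constructor and
# add_sample() signatures are the ones defined above.
#
#     t = MegaPBTree(gff_filename="heart.collapsed.gff",
#                    group_filename="heart.collapsed.group.txt",
#                    self_prefix="heart", max_fuzzy_junction=5)
#     t.add_sample(gff_filename="liver.collapsed.gff",
#                  group_filename="liver.collapsed.group.txt",
#                  sample_prefix="liver",
#                  o_gff_fn="merged.gff",
#                  o_group_fn="merged.group.txt",
#                  o_mega_fn="merged.mega_info.txt")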
def test_filter_by_count(self):
    """Test filter_by_count."""
    out_abundance_fn = op.join(_OUT_DIR_, "filter_by_count.abundance.txt")
    out_gff_fn = op.join(_OUT_DIR_, "filter_by_count.gff")
    out_rep_fn = op.join(_OUT_DIR_, "filter_by_count.rep.fastq")

    filter_by_count(in_group_filename=GROUP_FN,
                    in_abundance_filename=ABUNDANCE_FN,
                    in_gff_filename=GFF_FN,
                    in_rep_filename=REP_FN,
                    out_abundance_filename=out_abundance_fn,
                    out_gff_filename=out_gff_fn,
                    out_rep_filename=out_rep_fn,
                    min_count=20)

    out_abundance_ids = [r.pbid for r in AbundanceReader(out_abundance_fn)]
    self.assertEqual(out_abundance_ids, self.expected_good)

    out_gff_ids = [r.seqid for r in CollapseGffReader(out_gff_fn)]
    self.assertEqual(out_gff_ids, self.expected_good)

    out_rep_ids = [r.name.split('|')[0] for r in FastqReader(out_rep_fn)]
    self.assertEqual(out_rep_ids, self.expected_good)
def good_isoform_ids_by_removing_subsets(in_gff_filename, max_fuzzy_junction):
    """Return ids of collapsed isoforms, excluding isoforms which are
    a subset of any other isoform.

    Parameters:
      in_gff_filename -- input collapsed gff file
      max_fuzzy_junction -- maximum junction difference to call two exons a match
    """
    # Group records by locus index, e.g., PB.1.1 and PB.1.2 both map to key 1.
    recs_dict = defaultdict(lambda: [])
    with CollapseGffReader(in_gff_filename) as gff_reader:
        for r in gff_reader:
            assert r.seqid.startswith('PB.')
            recs_dict[int(r.seqid.split('.')[1])].append(r)

    good = []
    for k in sorted(recs_dict.keys()):
        recs = recs_dict[k]
        remove_subset_isoforms_from_list(recs,
                                         max_fuzzy_junction=max_fuzzy_junction)
        for r in recs:
            good.append(r.seqid)
    return good
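# Usage sketch (not part of the library): feed the ids returned by
# good_isoform_ids_by_removing_subsets() into write_good_collapsed_isoforms()
# below, which mirrors how filter_out_subsets() is exercised in the tests above.
# File names are hypothetical.
#
#     good = good_isoform_ids_by_removing_subsets(in_gff_filename="in.gff",
#                                                 max_fuzzy_junction=5)
#     write_good_collapsed_isoforms(in_abundance_filename="in.abundance.txt",
#                                   in_gff_filename="in.gff",
#                                   in_rep_filename="in.rep.fastq",
#                                   out_abundance_filename="out.abundance.txt",
#                                   out_gff_filename="out.gff",
#                                   out_rep_filename="out.rep.fastq",
#                                   good=good)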
def write_good_collapsed_isoforms(in_abundance_filename, in_gff_filename,
                                  in_rep_filename, out_abundance_filename,
                                  out_gff_filename, out_rep_filename, good):
    """Write abundance, gff and representative sequences of good collapsed isoforms."""
    in_suffix = parse_ds_filename(in_rep_filename)[1]
    out_suffix = parse_ds_filename(out_rep_filename)[1]
    if in_suffix != out_suffix:
        raise ValueError("Format of input %s and output %s must match." %
                         (in_rep_filename, out_rep_filename))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError("Format of input %s and output %s must be either FASTA or FASTQ." %
                         (in_rep_filename, out_rep_filename))

    # First, read gff and write good gff records.
    with CollapseGffWriter(out_gff_filename) as gff_writer:
        for r in CollapseGffReader(in_gff_filename):
            if r.seqid in good:
                gff_writer.writeRecord(r)

    # Next, read rep fasta/fastq and write good rep fasta/fastq records.
    rep_reader = FastaReader(in_rep_filename) if in_suffix == "fasta" \
        else FastqReader(in_rep_filename)
    rep_writer = FastaWriter(out_rep_filename) if in_suffix == "fasta" \
        else FastqWriter(out_rep_filename)
    for r in rep_reader:
        # e.g., r.name is PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465
        if r.name.split('|')[0] in good:
            rep_writer.writeRecord(r)

    # Finally, write abundance info of good records.
    with AbundanceReader(in_abundance_filename) as a_reader, \
         AbundanceWriter(out_abundance_filename,
                         comments=a_reader.comments) as a_writer:
        for r in a_reader:
            if r.pbid in good:
                a_writer.writeRecord(r)
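# Worked example (illustrative only): rep FASTA/FASTQ record names encode the
# collapsed isoform id before the first '|', which is why the filter above keys
# on r.name.split('|')[0]. Using the name format from the comment above:
#
#     name = "PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465"
#     pbid = name.split('|')[0]   # -> "PB.1.1"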
def pick_rep(isoform_filename, gff_filename, group_filename, output_filename,
             pick_least_err_instead=False, bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If input is a FASTA file -- always pick the longest sequence.
    If input is a FASTQ file --
        if pick_least_err_instead is True, pick the sequence with the least
        expected number of base errors;
        else, pick the longest sequence.
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and (_fns[0].endswith(".fq") or
                               _fns[0].endswith(".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be indexed FASTA, or contain exactly one FASTQ file.
                raise IOError("%s must contain either indexed FASTA files or "
                              "exactly one FASTQ file!" % isoform_filename)
    else:
        raise IOError("Unable to recognize file type of %s." % isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None
    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." %
                             isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    # Map transcript id --> "seqid:start-end(strand)" coordinate string.
    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    if bad_gff_filename is not None:
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if pb_id not in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members:
            if is_fq and pick_least_err_instead:
                # Expected number of base errors from Phred qualities.
                err = sum(10 ** -(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and
                len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
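# Worked example (illustrative only): the "least expected errors" criterion in
# pick_rep() uses the standard Phred conversion, turning each quality Q into an
# error probability 10 ** (-Q / 10) and summing over the read. For qualities
# [20, 30, 40]:
#
#     qualities = [20, 30, 40]
#     expected_err = sum(10 ** -(q / 10.) for q in qualities)
#     # 0.01 + 0.001 + 0.0001 = 0.0111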
def collapse_fuzzy_junctions(gff_filename, group_filename,
                             fuzzy_gff_filename, fuzzy_group_filename,
                             allow_extra_5exon, max_fuzzy_junction):
    """
    Collapse transcripts in gff_filename whose junctions differ by no more than
    max_fuzzy_junction base pairs, and return fuzzy_match.

    Parameters:
      gff_filename -- input unfuzzy gff filename
      group_filename -- input unfuzzy group filename
      fuzzy_gff_filename -- output gff filename, in which transcripts with
                            fuzzy junctions are further collapsed
      fuzzy_group_filename -- output group filename
      allow_extra_5exon -- whether or not to allow extra 5' exons
      max_fuzzy_junction -- maximum junction difference to call two exons a match
    """
    d = {}  # seqid --> GmapRecord
    # chr --> strand --> IntervalTree
    recs = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})
    # seqid --> [seqids of fuzzy-matching GmapRecords]
    fuzzy_match = defaultdict(lambda: [])

    for r in CollapseGffReader(gff_filename):
        # r: a GmapRecord which represents a transcript and its associated exons.
        d[r.seqid] = r
        has_match = False
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            # Compare r with r2 and get the match pattern:
            # exact, super, subset, partial or nonmatch.
            m = compare_fuzzy_junctions(r.ref_exons, r2.ref_exons,
                                        max_fuzzy_junction=max_fuzzy_junction)
            if can_merge(m, r, r2, allow_extra_5exon=allow_extra_5exon,
                         max_fuzzy_junction=max_fuzzy_junction):
                logging.debug("Collapsing fuzzy transcript %s to %s",
                              r.seqid, r2.seqid)
                fuzzy_match[r2.seqid].append(r.seqid)  # collapse r to r2
                has_match = True
                break
        if not has_match:
            logging.debug("No fuzzy transcript found for %s", r.seqid)
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    # Get group info from the input group_filename.
    group_info = {group.name: group.members
                  for group in GroupReader(group_filename)}

    # Pick for each fuzzy group the isoform that has the most exons
    # (if tie, then the most FL reads).
    keys = sorted(fuzzy_match.keys(),
                  key=lambda x: [int(i) for i in x.split('.')[1:]])

    fuzzy_gff_writer = CollapseGffWriter(fuzzy_gff_filename)
    fuzzy_group_writer = GroupWriter(fuzzy_group_filename)
    for k in keys:  # iterate over each group of fuzzy-matching GmapRecords
        all_members = []
        # Assume the first GmapRecord best represents this fuzzy-match group.
        best_pbid = fuzzy_match[k][0]  # e.g., PB.1.1
        if best_pbid not in group_info:
            raise ValueError("Could not find %s in Group file %s" %
                             (best_pbid, group_filename))
        best_size = len(group_info[best_pbid])
        best_num_exons = len(d[best_pbid].ref_exons)
        all_members += group_info[best_pbid]

        for pbid in fuzzy_match[k][1:]:  # continue to look for a better representative
            if pbid not in group_info:
                raise ValueError("Could not find %s in Group file %s" %
                                 (pbid, group_filename))
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or \
               (_num_exons == best_num_exons and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons

        # Write the best GmapRecord of the group to fuzzy_gff_filename.
        fuzzy_gff_writer.writeRecord(d[best_pbid])
        # Write all members of the group to fuzzy_group_filename.
        fuzzy_group_writer.writeRecord(GroupRecord(best_pbid, all_members))

    fuzzy_gff_writer.close()
    fuzzy_group_writer.close()
    return fuzzy_match
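# Usage sketch (not part of the library): collapse fuzzy junctions in a
# collapsed GFF/group pair, as exercised by test_collapse_fuzzy_junctions above.
# File names are hypothetical.
#
#     fuzzy_match = collapse_fuzzy_junctions(gff_filename="in.gff",
#                                            group_filename="in.group.txt",
#                                            fuzzy_gff_filename="out.fuzzy.gff",
#                                            fuzzy_group_filename="out.fuzzy.group.txt",
#                                            allow_extra_5exon=True,
#                                            max_fuzzy_junction=5)
#     # fuzzy_match maps the first-seen seqid of each fuzzy group to the list of
#     # seqids collapsed into that group (including itself).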