# Import paths below are a best guess at the environment this code targets
# (cDNA_Cupcake-style GFF/compare_junctions modules, bx-python interval trees,
# pbcore FASTQ I/O); adjust them if the actual package layout differs.
from collections import defaultdict
from csv import DictReader, DictWriter
from bx.intervals.cluster import ClusterTree
from bx.intervals.intersection import IntervalTree
from pbcore.io import FastqReader, FastqWriter
from cupcake.io import GFF
from cupcake.tofu import compare_junctions


def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix):
    combined = []  # list of (r1 if r2 is None | r2 if r1 is None | longer of r1 or r2 if both not None)
    unmatched_recs = self.record_d.keys()
    for r in GFF.collapseGFFReader(gff_filename):
        match_rec = self.match_record_to_tree(r)
        if match_rec is not None:  # found a match! put longer of r1/r2 in
            combined.append((match_rec, r))
            try:
                unmatched_recs.remove(match_rec.seqid)
            except ValueError:
                pass  # already deleted, OK, this happens for single-exon transcripts
        else:  # r is not present in current tree
            combined.append((None, r))
    # put whatever is left from the tree in
    for seqid in unmatched_recs:
        combined.append((self.record_d[seqid], None))
    # create a ClusterTree to re-calc the loci/transcripts
    final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)})
    for i, (r1, r2) in enumerate(combined):
        if r2 is None or (r1 is not None and r1.end - r1.start > r2.end - r2.start):
            final_tree[r1.chr][r1.strand].insert(r1.start, r1.end, i)
        else:
            final_tree[r2.chr][r2.strand].insert(r2.start, r2.end, i)
    self.write_cluster_tree_as_gff(final_tree, combined, group_filename, sample_prefix, output_prefix)
def __init__(self, gff_filename, group_filename, internal_fuzzy_max_dist=0, self_prefix=None):
    self.gff_filename = gff_filename
    self.group_filename = group_filename
    self.self_prefix = self_prefix
    self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
    self.record_d = dict((r.seqid, r) for r in GFF.collapseGFFReader(gff_filename))
    self.tree = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    self.read_gff_as_interval_tree()
    self.group_info = MegaPBTree.read_group(self.group_filename, self.self_prefix)  # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
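# Minimal usage sketch for the class these methods belong to (the sample file
# names and prefixes below are illustrative, not part of this module):
#
#   mega = MegaPBTree('sample1.collapsed.gff', 'sample1.group.txt',
#                     internal_fuzzy_max_dist=5, self_prefix='sample1')
#   mega.add_sample('sample2.collapsed.gff', 'sample2.group.txt',
#                   sample_prefix='sample2', output_prefix='merged')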
def filter_by_count(input_prefix, output_prefix, min_count):
    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    # read group
    group_max_count_fl = {}
    group_max_count_p = {}
    f = open(group_filename)
    for line in f:
        # ex: PB.1.1    i0HQ_54b0ca|c58773/f30p16/700
        pbid, members = line.strip().split('\t')
        group_max_count_fl[pbid] = 0
        group_max_count_p[pbid] = 0
        members = members.split(',')
        for m in members:
            tmp = m.split('|')[1].split('/')[1]  # ex: tmp = f30p16
            fl_count, p_count = tmp.split('p')
            fl_count = int(fl_count[1:])
            p_count = int(p_count)
            group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
            group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    f.close()

    # group_max_count_p NOT used for now
    good = filter(lambda x: int(d[x]['count_fl']) >= min_count and
                            group_max_count_fl[x] >= min_count and
                            group_max_count_p[x] >= 0, d)

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good:
            GFF.write_collapseGFF_format(f, r)
    f.close()

    # write output rep.fq
    f = FastqWriter(output_prefix + '.rep.fq')
    for r in FastqReader(rep_filename):
        if r.name.split('|')[0] in good:
            f.writeRecord(r)
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb',
                                       'norm_fl', 'norm_nfl', 'norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        writer.writerow(d[k])
    f.close()
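# Hypothetical driver for filter_by_count(); the prefixes and threshold are
# placeholders. The call expects <input_prefix>.group.txt/.abundance.txt/.gff/.rep.fq
# to exist and writes the filtered .gff/.rep.fq/.abundance.txt under output_prefix:
#
#   filter_by_count('test.collapsed', 'test.collapsed.min_fl_2', min_count=2)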
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix")
    parser.add_argument("output_prefix", help="Output prefix")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")
    args = parser.parse_args()

    count_filename = args.input_prefix + '.abundance.txt'
    gff_filename = args.input_prefix + '.gff'
    rep_filename = args.input_prefix + '.rep.fq'

    # bin the records by locus index (ex: PB.1.1 --> locus 1)
    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(args.output_prefix + '.gff', 'w')
    keys = recs.keys()
    keys.sort()
    for k in keys:  # iterate loci in sorted order
        recs_of_locus = recs[k]
        # filter_out_subsets is defined elsewhere in this module
        filter_out_subsets(recs_of_locus, args.fuzzy_junction)
        for r in recs_of_locus:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    f.close()

    # write output rep.fq
    f = FastqWriter(args.output_prefix + '.rep.fq')
    for r in FastqReader(rep_filename):
        if r.name.split('|')[0] in good:
            f.writeRecord(r)
    f.close()

    # write output to .abundance.txt
    f = open(args.output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb',
                                       'norm_fl', 'norm_nfl', 'norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        writer.writerow(d[k])
    f.close()
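# Example invocation of the CLI above (a sketch; the script name is a
# placeholder for whatever file this module is saved as):
#
#   python filter_away_subset.py test.collapsed test.collapsed.filtered --fuzzy_junction 5
#
# This reads test.collapsed.gff/.abundance.txt/.rep.fq and writes the surviving
# records to test.collapsed.filtered.gff/.abundance.txt/.rep.fq.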
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon, internal_fuzzy_max_dist):

    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) m is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on the same 3' exon, that is the last acceptor site agrees,
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2-1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = fuzzy_match.keys()
    keys.sort(key=lambda x: map(int, x.split('.')[1:]))

    f_gff = open(gff_filename + '.fuzzy', 'w')
    f_group = open(group_filename + '.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid = fuzzy_match[k][0]
        best_size = get_fl_from_id(group_info[best_pbid])  # FL count, consistent with the tie-break below
        best_num_exons = len(d[best_pbid].ref_exons)
        all_members += group_info[best_pbid]
        for pbid in fuzzy_match[k][1:]:
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()
    return fuzzy_match
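# Sketch of a typical call (the file names are placeholders). The function
# writes <gff_filename>.fuzzy and <group_filename>.fuzzy next to the inputs
# and returns the pbid --> [merged pbids] mapping:
#
#   fuzzy_match = collapse_fuzzy_junctions(
#       'test.collapsed.gff', 'test.collapsed.group.txt',
#       allow_extra_5exon=True, internal_fuzzy_max_dist=5)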
def read_gff_as_interval_tree(self):
    """
    Read a collapsed GFF file into an IntervalTree
    """
    for r in GFF.collapseGFFReader(self.gff_filename):
        self.tree[r.chr][r.strand].insert(r.start, r.end, r)
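# Example query against the populated tree (a sketch; the chromosome name and
# coordinates are made up). bx-python's IntervalTree.find(start, end) returns
# the stored record objects whose intervals overlap the query:
#
#   mega = MegaPBTree('sample1.collapsed.gff', 'sample1.group.txt')
#   overlapping_recs = mega.tree['chr1']['+'].find(1000, 2000)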