def scrub_sample_GFFs(sample_dirs, gff_filename, count_filename, group_filename,
                      fastq_filename, output_prefix, tree):
    for sample_name, d in sample_dirs.items():
        outf = open(os.path.join(d, output_prefix + '.gff.tmp'), 'w')
        for r in GFF.collapseGFFReader(os.path.join(d, gff_filename)):
            n = len(r.ref_exons)
            if n == 1:
                # single-exon records have no junctions to scrub; write as-is
                GFF.write_collapseGFF_format(outf, r)
                continue
            new_ref_exons = scrub_ref_exons(r, tree)
            if new_ref_exons is None:
                print("No changes made due to error:", r.seqid, file=sys.stderr)
            else:
                r.ref_exons = new_ref_exons
            GFF.write_collapseGFF_format(outf, r)
        outf.close()
        cleanup_scrubbed_files_redundancy(
            outf.name,
            os.path.join(d, group_filename),
            os.path.join(d, count_filename),
            os.path.join(d, fastq_filename) if fastq_filename is not None else None,
            os.path.join(d, output_prefix))
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")
    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    count_filename, gff_filename, rep_filename, rep_type = sanity_check_collapse_input(args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq (or rep.fa, matching the input rep type)
    rep_output = output_prefix + '.rep.' + ('fq' if rep_type == 'fastq' else 'fa')
    f = open(rep_output, 'w')
    for r in SeqIO.parse(open(rep_filename), rep_type):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, rep_type)
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb',
                                       'norm_fl', 'norm_nfl', 'norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", rep_output, file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
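# sanity_check_collapse_input is not defined in this file. The sketch below is
# an assumption reconstructed from its call sites: the filter above unpacks a
# four-tuple including the rep type, while other mains in this file unpack a
# three-tuple (without rep_type). The function name and logic are illustrative.
import os
import sys

def sanity_check_collapse_input_sketch(input_prefix):
    """Resolve the expected .abundance.txt/.gff/.rep files for a prefix."""
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    if os.path.exists(input_prefix + '.rep.fq'):
        rep_filename, rep_type = input_prefix + '.rep.fq', 'fastq'
    elif os.path.exists(input_prefix + '.rep.fa'):
        rep_filename, rep_type = input_prefix + '.rep.fa', 'fasta'
    else:
        print("Expected rep file {0}.rep.fq or {0}.rep.fa not found. Abort!".format(input_prefix), file=sys.stderr)
        sys.exit(-1)
    for filename in (count_filename, gff_filename):
        if not os.path.exists(filename):
            print("File {0} does not exist. Abort!".format(filename), file=sys.stderr)
            sys.exit(-1)
    return count_filename, gff_filename, rep_filename, rep_type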
def chain_split_file(ref_gff, ref_group, ref_name, addon_gff, addon_group, addon_name,
                     fuzzy_junction, allow_5merge, max_3_diff, n_chunks):
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)

    split_files = []
    i = 0
    counter = 0
    f_gff = open(addon_gff + '.split' + str(i), 'w')
    f_group = open(addon_group + '.split' + str(i), 'w')
    for v1 in tree.values():
        for strand in ('+', '-'):
            v2 = v1[strand]
            for _start, _end, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write("{0}\t{1}\n".format(recs[cur].seqid,
                                                      ",".join(addon_group_info[recs[cur].seqid])))
                    counter += 1
                if counter >= (i + 1) * chunk_size:
                    i += 1
                    f_gff.close()
                    f_group.close()
                    split_files.append((f_gff.name, f_group.name))
                    # guard against opening an empty extra split file once all
                    # records (or all chunks) have been consumed
                    if i >= n_chunks or counter >= len(recs):
                        break
                    f_gff = open(addon_gff + '.split' + str(i), 'w')
                    f_group = open(addon_group + '.split' + str(i), 'w')
    if not f_gff.closed:
        f_gff.close()
        f_group.close()
        split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(target=chain_helper,
                    args=(ref_gff, ref_group, split_gff, split_group, ref_name,
                          addon_name + '.' + str(i), fuzzy_junction, allow_5merge, max_3_diff))
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, addon_name + '.' + str(i)))
    for p in pools:
        p.join()
    return result_prefixes, split_files
def write_reclist_to_gff_n_info(rec_list, final_prefix, ref_name, addon_name, use_fq=False):
    # go through the rec list and figure out in what order we are outputting the total records
    tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)})
    tree_keys_numeric = set()
    tree_keys_alpha = set()
    for i, match_rec in enumerate(rec_list):
        tree[match_rec.rec.chr][match_rec.rec.strand].insert(match_rec.rec.start, match_rec.rec.end, i)

    for chrom in tree:
        try:
            k = int(chrom)
            tree_keys_numeric.add(k)
        except ValueError:
            tree_keys_alpha.add(chrom)
    tree_keys = sorted(tree_keys_numeric) + sorted(tree_keys_alpha)

    f_gff = open(final_prefix + '.gff', 'w')
    f_info = open(final_prefix + '.mega_info.txt', 'w')
    writer_info = DictWriter(f_info, fieldnames=['superPBID', ref_name, addon_name], delimiter='\t')
    writer_info.writeheader()
    f_group = open(final_prefix + '.group.txt', 'w')
    if use_fq:
        f_fq = open(final_prefix + '.rep.fq', 'w')

    # sort the combined gff (tree) by chromosome and strand ('+' first)
    new_group_info = {}
    pb_i = 0
    for _chr in tree_keys:
        # remember to convert potential integer chromosome keys back to strings now that they are sorted!
        _chr = str(_chr)
        for _strand in ('+', '-'):
            for _start, _end, _indices in tree[_chr][_strand].getregions():
                # further sort these records by (start, end, num_exons)
                _indices.sort(key=lambda i: (rec_list[i].rec.start,
                                             rec_list[i].rec.end,
                                             len(rec_list[i].rec.ref_exons)))
                pb_i += 1
                for pb_j, recs_index in enumerate(_indices):
                    pbgene = "PB.{0}".format(pb_i)
                    pbid = "PB.{0}.{1}".format(pb_i, pb_j + 1)
                    match_rec = rec_list[recs_index]
                    new_group_info[pbid] = match_rec.members
                    match_rec.rec.seqid = pbid
                    match_rec.rec.geneid = pbgene
                    GFF.write_collapseGFF_format(f_gff, match_rec.rec)
                    writer_info.writerow({'superPBID': pbid,
                                          ref_name: match_rec.ref_id,
                                          addon_name: match_rec.addon_id})
                    f_group.write("{0}\t{1}\n".format(pbid, ",".join(match_rec.members)))
                    if use_fq:
                        match_rec.seqrec.id = pbid
                        match_rec.seqrec.description = ''
                        SeqIO.write(match_rec.seqrec, f_fq, 'fastq')
    f_gff.close()
    f_info.close()
    f_group.close()
    if use_fq:
        f_fq.close()
    return new_group_info
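# Minimal demonstration of the bx-python ClusterTree API used above (assuming
# bx-python is installed): intervals inserted with a record index are grouped
# into loci, and getregions() yields (start, end, [indices]) per locus.
from bx.intervals.cluster import ClusterTree

ct = ClusterTree(0, 0)      # cluster any overlapping intervals
ct.insert(100, 500, 0)      # record index 0
ct.insert(450, 900, 1)      # overlaps record 0 --> same locus
ct.insert(2000, 2500, 2)    # disjoint --> its own locus
for start, end, indices in ct.getregions():
    print(start, end, indices)  # (100, 900, [0, 1]) then (2000, 2500, [2])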
def regroup_sam_to_gff(pooled_sam, demux_count_file, output_prefix, out_group_dict, in_fafq=None):
    """
    :param pooled_sam: SAM file
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group to belong in (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None:
        type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(lambda: set())  # pbid --> set of tissues it is in (EM, END, R)
    for r in DictReader(open(demux_count_file), delimiter=','):
        for k, v in r.items():
            if k == 'id':
                continue
            if int(v) > 0:
                in_tissue[r['id']].add(k)
    in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open("{o}_{g}_only.gff".format(o=output_prefix, g=g), 'w')
        if in_fafq is not None:
            handles_fafq[g] = open("{o}_{g}_only.{t}".format(o=output_prefix, g=g, t=type_fafq), 'w')

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None:
                fafq_dict[m.group(1)] = fafq_dict[k]

    reader = GMAPSAMReader(pooled_sam, True)
    for r in reader:
        if r.sID == '*':
            print("Ignore {0} because unmapped.".format(r.qID), file=sys.stderr)
            continue
        m = rex_pbid.match(r.qID)
        pbid = m.group(1) if m is not None else r.qID
        # convert the SAM record into a GFF record in place
        r.seqid = pbid
        r.chr = r.sID
        r.start, r.end = r.sStart, r.sEnd
        r.strand = r.flag.strand
        r.ref_exons = r.segments
        r.cds_exons = None

        if pbid not in in_tissue:
            print("WARNING: {0} does not belong to any group indicated by outgroup_dict".format(pbid), file=sys.stderr)
            continue
        groups_to_write_in = set()
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)
def regroup_gff(pooled_gff, demux_count_file, output_prefix, out_group_dict, in_fafq=None):
    """
    :param pooled_gff: GFF file
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group to belong in (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None:
        type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(lambda: set())  # pbid --> set of tissues it is in (EM, END, R)
    for r in DictReader(open(demux_count_file), delimiter=','):
        for k, v in r.items():
            if k == 'id':
                continue
            if int(v) > 0:
                in_tissue[r['id']].add(k)
    in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open("{o}_{g}_only.gff".format(o=output_prefix, g=g), 'w')
        if in_fafq is not None:
            handles_fafq[g] = open("{o}_{g}_only.{t}".format(o=output_prefix, g=g, t=type_fafq), 'w')

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None:
                fafq_dict[m.group(1)] = fafq_dict[k]

    reader = GFF.collapseGFFReader(pooled_gff)
    for r in reader:
        pbid = r.seqid
        if pbid not in in_tissue:
            print("WARNING: {0} does not belong to any group indicated by outgroup_dict".format(pbid), file=sys.stderr)
            continue
        groups_to_write_in = set()
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])
        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)
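# A small, self-contained illustration of the demux count file parsing shared
# by both regroup functions above. The barcode names and counts are made up;
# the format (an 'id' column plus one column per barcode) is what the
# DictReader usage implies.
import io
from collections import defaultdict
from csv import DictReader

demux_csv = "id,EM1,EM2,END1\nPB.1.1,3,0,1\n"
in_tissue = defaultdict(set)
for row in DictReader(io.StringIO(demux_csv), delimiter=','):
    for k, v in row.items():
        if k != 'id' and int(v) > 0:
            in_tissue[row['id']].add(k)
print(dict(in_tissue))  # {'PB.1.1': {'EM1', 'END1'}}
# with out_group_dict = {'EM1': 'EM', 'EM2': 'EM', 'END1': 'END'}, PB.1.1 would
# be written to both the EM and END output GFFs.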
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")
    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb',
                                       'norm_fl', 'norm_nfl', 'norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")
    args = parser.parse_args()
    output_prefix = args.input_prefix + '.nomono'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(args.input_prefix)

    good = []
    f = open(output_prefix + '.gff', 'w')
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        if len(r.ref_exons) > 1:  # keep only multi-exon records
            good.append(r.seqid)
            GFF.write_collapseGFF_format(f, r)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb',
                                       'norm_fl', 'norm_nfl', 'norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon, internal_fuzzy_max_dist):

    def get_fl_from_id(members):
        try:
            # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
            return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)
        except ValueError:
            return 0

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is reached only if (a) m is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on the same 3' exon, i.e. the last acceptor site agrees,
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2 - 1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            # max_5_diff/max_3_diff come from the module-level `args` set up by the CLI parser
            m = compare_junctions.compare_junctions(r, r2,
                                                    internal_fuzzy_max_dist=internal_fuzzy_max_dist,
                                                    max_5_diff=args.max_5_diff,
                                                    max_3_diff=args.max_3_diff)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL counts)
    keys = sorted(fuzzy_match.keys(), key=lambda x: tuple(map(int, x.split('.')[1:])))

    f_gff = open(gff_filename + '.fuzzy', 'w')
    f_group = open(group_filename + '.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid = fuzzy_match[k][0]
        best_size = get_fl_from_id(group_info[best_pbid])  # FL count, matching the tie-break rule above
        best_num_exons = len(d[best_pbid].ref_exons)
        all_members += group_info[best_pbid]
        for pbid in fuzzy_match[k][1:]:
            # note: get_fl_from_id only works on IsoSeq1 and 2 ID formats; returns 0 for IsoSeq3 or other formats
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()
    return fuzzy_match
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon, internal_fuzzy_max_dist):

    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        # (works on IsoSeq1/2 ID formats only; raises ValueError on other formats)
        return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is reached only if (a) m is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  # rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on the same 3' exon, i.e. the last acceptor site agrees,
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2 - 1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(r, r2,
                                                    internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL counts)
    keys = sorted(fuzzy_match.keys(), key=lambda x: tuple(map(int, x.split('.')[1:])))

    f_gff = open(gff_filename + '.fuzzy', 'w')
    f_group = open(group_filename + '.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid = fuzzy_match[k][0]
        best_size = get_fl_from_id(group_info[best_pbid])  # FL count, matching the tie-break rule above
        best_num_exons = len(d[best_pbid].ref_exons)
        all_members += group_info[best_pbid]
        for pbid in fuzzy_match[k][1:]:
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()
    return fuzzy_match
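# Worked example of the FL-count parsing in get_fl_from_id above, using the
# sample IsoSeq1/2-style ID from the comment ('f1p0' encodes 1 FL read, 0 partial).
_id = '13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178'
fl_field = _id.split('/')[1]                 # 'f1p0'
fl_count = int(fl_field.split('p')[0][1:])   # 1
print(fl_count)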
def chain_split_file(ref_gff, ref_group, ref_name, addon_gff, addon_group, addon_name,
                     fuzzy_junction, allow_5merge, max_3_diff, n_chunks):
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)
    #print("# of recs: {0}, cpus: {1}, chunk_size: {2}".format(n, n_chunks, chunk_size))

    split_files = []
    i = 0
    counter = 0
    f_gff = open(addon_gff + '.split' + str(i), 'w')
    f_group = open(addon_group + '.split' + str(i), 'w')
    for v1 in tree.values():
        for strand in ('+', '-'):
            v2 = v1[strand]
            for _start, _end, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write("{0}\t{1}\n".format(recs[cur].seqid,
                                                      ",".join(addon_group_info[recs[cur].seqid])))
                    counter += 1
                # note: because we are limited by how the records are organized by (chrom, strand),
                # we may not end up using all the chunks; ex: if all records are on the same locus,
                # we end up writing everything to one split file
                if counter >= (i + 1) * chunk_size:
                    i += 1
                    f_gff.close()
                    f_group.close()
                    split_files.append((f_gff.name, f_group.name))
                    if i >= n_chunks or counter >= len(recs):
                        break
                    f_gff = open(addon_gff + '.split' + str(i), 'w')
                    f_group = open(addon_group + '.split' + str(i), 'w')
    if not f_gff.closed:
        f_gff.close()
        f_group.close()
        split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(target=chain_helper,
                    args=(ref_gff, ref_group, split_gff, split_group, ref_name,
                          addon_name + '.' + str(i), fuzzy_junction, allow_5merge, max_3_diff))
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, addon_name + '.' + str(i)))
    for p in pools:
        p.join()
    #print("split files: {0}, result_prefix: {1}".format(split_files, result_prefixes))
    return result_prefixes, split_files
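# The chunk_size expression above is integer ceiling division, ceil(n / n_chunks),
# spelled out with a boolean-to-int remainder bump:
n, n_chunks = 10, 4
chunk_size = (n // n_chunks) + (n % n_chunks > 0)
print(chunk_size)  # 3, since 10 records split across 4 chunks need up to 3 each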
def get_gff_from_list(gff_filename, listfile, partial_ok=False):
    seqs = set(line.strip() for line in open(listfile))
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in seqs or r.seqid.split('|')[0] in seqs or \
           (partial_ok and any(r.seqid.startswith(x) for x in seqs)):
            GFF.write_collapseGFF_format(sys.stdout, r)
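# Hypothetical usage of get_gff_from_list (file names are illustrative): write
# to stdout only the records whose seqid appears in the ID list file.
# get_gff_from_list('my.collapsed.gff', 'wanted_ids.txt', partial_ok=False)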
def cleanup_scrubbed_files_redundancy(gff_filename, group_filename, count_filename, fastq_filename, output_prefix):

    junction_seen = defaultdict(lambda: defaultdict(lambda: []))  # key (chr,strand) --> dict of (series of junctions) --> list of records
    for r in GFF.collapseGFFReader(gff_filename):
        n = len(r.ref_exons)
        if n == 1:
            junc_str = str(r.start) + ',' + str(r.end)
            # append (not assign) so single-exon records sharing the same
            # coordinates get merged just like multi-exon records
            junction_seen[r.chr, r.strand][junc_str].append(r)
        else:
            junc_str = ",".join(str(r.ref_exons[i].end) + ',' + str(r.ref_exons[i + 1].start)
                                for i in range(n - 1))
            junction_seen[r.chr, r.strand][junc_str].append(r)

    # write out cleaned GFF
    outf = open(output_prefix + '.gff', 'w')
    outf2 = open(output_prefix + '.merged_ids.txt', 'w')
    merged = {}
    keys = sorted(junction_seen.keys())
    for k in keys:
        for bunch in junction_seen[k].values():
            if len(bunch) == 1:  # just one record, write it out
                r = bunch[0]
                GFF.write_collapseGFF_format(outf, r)
                merged[r.seqid] = [r.seqid]
            else:
                # find the representative (the longest-spanning record)
                r = bunch[0]
                for r2 in bunch[1:]:
                    if r2.end - r2.start > r.end - r.start:
                        r = r2
                GFF.write_collapseGFF_format(outf, r)
                merged[r.seqid] = [x.seqid for x in bunch]
            outf2.write("{0}\t{1}\n".format(r.seqid, ",".join(merged[r.seqid])))
    outf.close()
    outf2.close()

    count_d, count_header = read_count_file(count_filename)
    # write out count file
    outf = open(output_prefix + '.abundance.txt', 'w')
    outf.write(count_header)
    writer = DictWriter(outf, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb',
                                          'norm_fl', 'norm_nfl', 'norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for pbid, bunch in merged.items():
        # combine the counts (fields_to_add is assumed to be a module-level list of the numeric count fields)
        r = count_d[bunch[0]]
        r['pbid'] = pbid
        for field in fields_to_add:
            r[field] = float(r[field])
        for _id in bunch[1:]:
            for field in fields_to_add:
                r[field] += float(count_d[_id][field])
        writer.writerow(r)
    outf.close()

    group_info = read_group_file(group_filename)
    # write out group file
    outf = open(output_prefix + '.group.txt', 'w')
    for pbid, bunch in merged.items():
        # combine the groups
        g = [group_info[bunch[0]]]
        for _id in bunch[1:]:
            g.append(group_info[_id])
        outf.write("{0}\t{1}\n".format(pbid, ",".join(g)))
    outf.close()

    # write out fastq file if present
    if fastq_filename is not None:
        outf = open(output_prefix + '.rep.fq', 'w')
        for r in SeqIO.parse(open(fastq_filename), 'fastq'):
            if r.id.split('|')[0] in merged or r.id in merged:
                SeqIO.write(r, outf, 'fastq')
        outf.close()

    print("scrubbed files written: {0}.gff, {0}.group.txt, {0}.abundance.txt, {0}.merged_ids.txt".format(output_prefix),
          file=sys.stderr)
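# Worked example of the junction-key construction above: the key strings
# together each donor/acceptor pair, so records that differ only in their
# 5'/3' endpoints hash to the same key. Coordinates are illustrative.
exon_bounds = [(100, 200), (300, 400), (500, 600)]
junc_str = ",".join("{0},{1}".format(exon_bounds[i][1], exon_bounds[i + 1][0])
                    for i in range(len(exon_bounds) - 1))
print(junc_str)  # '200,300,400,500'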
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: filtered.microexon)")
    parser.add_argument("--micro_exon_size", type=int, default=12,
                        help="Filter away microexons < micro_exon_size (default: 12bp)")
    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered.microexon'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        for r in xxx:
            min_exon_size = min(e.end - e.start for e in r.ref_exons)
            if min_exon_size > args.micro_exon_size:  # smallest exon must exceed the threshold
                GFF.write_collapseGFF_format(f, r)
                good.append(r.seqid)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb',
                                       'norm_fl', 'norm_nfl', 'norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
def filter_by_count(input_prefix, output_prefix, min_count, dun_use_group_count=False):

    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        f = open(group_filename)
        for line in f:
            # ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split('\t')
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(',')
            for m in members:
                i = m.find('|')
                if i > 0:
                    tmp = m.split('|')[1].split('/')[1]  # ex: tmp = f30p16
                else:
                    tmp = m.split('/')[1]
                fl_count, p_count = tmp.split('p')
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
        f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    #for k, v in d.items(): print(k, v)   # (debug)
    f.close()

    # group_max_count_p NOT used for now
    good = [x for x in d
            if int(d[x]['count_fl']) >= min_count and
            (dun_use_group_count or group_max_count_fl[x] >= min_count)]

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good:
            GFF.write_collapseGFF_format(f, r)
    f.close()

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb',
                                       'norm_fl', 'norm_nfl', 'norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
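# Worked example of the per-member FL/partial parsing in filter_by_count above,
# using the group-file member format from the comment ('f30p16' = 30 FL reads,
# 16 partial reads).
m = 'i0HQ_54b0ca|c58773/f30p16/700'
tmp = (m.split('|')[1] if '|' in m else m).split('/')[1]  # 'f30p16'
fl_count, p_count = tmp.split('p')
print(int(fl_count[1:]), int(p_count))  # 30 16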
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")
    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    #group_filename = args.input_prefix + '.group.txt'
    count_filename = args.input_prefix + '.abundance.txt'
    gff_filename = args.input_prefix + '.gff'
    rep_filename = args.input_prefix + '.rep.fq'
    for filename in (count_filename, gff_filename, rep_filename):
        if not os.path.exists(filename):
            print("File {0} does not exist. Abort!".format(filename), file=sys.stderr)
            sys.exit(-1)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    #for k, v in d.items(): print(k, v)   # (debug)
    f.close()

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb',
                                       'norm_fl', 'norm_nfl', 'norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
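# Minimal illustration of the header-skipping pattern used above: consume the
# leading '#' comment lines, then rewind one line so DictReader starts exactly
# at the column-header row. StringIO stands in for the abundance file.
import io

f = io.StringIO("#comment1\n#comment2\npbid\tcount_fl\nPB.1.1\t5\n")
count_header = ''
while True:
    cur_pos = f.tell()
    line = f.readline()
    if not line.startswith('#'):
        f.seek(cur_pos)
        break
    count_header += line
print(repr(count_header))     # '#comment1\n#comment2\n'
print(f.readline().rstrip())  # 'pbid\tcount_fl'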
def filter_by_count(input_prefix, output_prefix, min_count, dun_use_group_count=False):

    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        f = open(group_filename)
        for line in f:
            # ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split('\t')
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(',')
            for m in members:
                i = m.find('|')
                if i > 0:
                    tmp = m.split('|')[1].split('/')[1]  # ex: tmp = f30p16
                else:
                    tmp = m.split('/')[1]
                fl_count, p_count = tmp.split('p')
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
        f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    #for k, v in d.items(): print(k, v)   # (debug)
    f.close()

    # group_max_count_p NOT used for now
    good = [x for x in d
            if int(d[x]['count_fl']) >= min_count and
            (dun_use_group_count or group_max_count_fl[x] >= min_count)]

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good:
            GFF.write_collapseGFF_format(f, r)
    f.close()

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb',
                                       'norm_fl', 'norm_nfl', 'norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)