import os
import sys
from collections import defaultdict
from cPickle import load

# NOTE: branch_simple2, check_ids_unique, pick_rep, collapse_fuzzy_junctions,
# find_fusion_candidates, iter_gmap_sam_for_fusion, merge_fusion_exons, and
# tofu_wrap are assumed to be imported from the surrounding package.


def main(args):
    if not os.path.exists(args.input):
        print >> sys.stderr, "Input file {0} does not exist. Abort.".format(args.input)
        sys.exit(-1)
    if not os.path.exists(args.sam):
        print >> sys.stderr, "SAM file {0} does not exist. Abort.".format(args.sam)
        sys.exit(-1)

    # check for duplicate IDs
    check_ids_unique(args.input, is_fq=args.fq)

    ignored_fout = open(args.prefix + '.ignored_ids.txt', 'w')
    f_gff = open(args.prefix + '.collapsed.gff', 'w')
    f_txt = open(args.prefix + '.collapsed.group.txt', 'w')

    b = branch_simple2.BranchSimple(args.input, cov_threshold=1,
                                    min_aln_coverage=args.min_aln_coverage,
                                    min_aln_identity=args.min_aln_identity,
                                    is_fq=args.fq)
    sam_iter = b.iter_gmap_sam(args.sam, ignored_fout)
    for recs in sam_iter:
        for v in recs.itervalues():
            if len(v) > 0:
                b.process_records(v, args.allow_extra_5exon, False,
                                  f_gff, f_gff, f_txt)

    ignored_fout.close()
    f_gff.close()
    f_txt.close()

    if args.fq:
        outfile = args.prefix + ".collapsed.rep.fq"
    else:
        outfile = args.prefix + ".collapsed.rep.fa"
    pick_rep(args.input, f_gff.name, f_txt.name, outfile, is_fq=args.fq)

    print >> sys.stderr, "Ignored IDs written to:", ignored_fout.name
    print >> sys.stderr, "Output written to:"
    print >> sys.stderr, f_gff.name
    print >> sys.stderr, f_txt.name
    print >> sys.stderr, outfile
    print >> sys.stderr, args
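
# main() relies on check_ids_unique() to refuse inputs with duplicate sequence
# IDs. The real helper is defined elsewhere in this package; below is only a
# minimal illustrative sketch of that check, assuming Biopython's SeqIO is
# available. The name check_ids_unique_sketch is hypothetical.
def check_ids_unique_sketch(filename, is_fq=False):
    from Bio import SeqIO
    seen = set()
    for r in SeqIO.parse(open(filename), 'fastq' if is_fq else 'fasta'):
        if r.id in seen:
            # a duplicate ID would make the collapse groups ambiguous
            print >> sys.stderr, "Duplicate sequence id {0} found. Abort!".format(r.id)
            sys.exit(-1)
        seen.add(r.id)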
def fusion_main(fa_or_fq_filename, sam_filename, output_prefix, is_fq=False,
                allow_extra_5_exons=True, skip_5_exon_alt=True,
                prefix_dict_pickle_filename=None,
                min_locus_coverage=.05, min_total_coverage=.99,
                min_locus_coverage_bp=1, min_dist_between_loci=10000):
    """
    (1) identify fusion candidates (based on mapping, total coverage, identity, etc.)
    (2) group/merge the fusion exons, using an index to point to each individual part
    (3) use BranchSimple to write out a tmp GFF where
        PBfusion.1.1 is the first part of a fusion gene and
        PBfusion.1.2 is the second part of a fusion gene
    (4) read the tmp file from (3) and modify it so that PBfusion.1 represents
        the whole fusion gene (a single-transcript GFF format)
    """
    compressed_records_pointer_dict = defaultdict(list)
    merged_exons = []
    merged_i = 0

    # step (0). check for duplicate IDs
    check_ids_unique(fa_or_fq_filename, is_fq=is_fq)

    # step (1). identify fusion candidates
    bs = branch_simple2.BranchSimple(fa_or_fq_filename, is_fq=is_fq)
    fusion_candidates = find_fusion_candidates(
        sam_filename, bs.transfrag_len_dict,
        min_locus_coverage, min_locus_coverage_bp,
        min_total_coverage, min_dist_between_loci)

    # step (2). merge the fusion exons
    for recs in iter_gmap_sam_for_fusion(sam_filename, fusion_candidates,
                                         bs.transfrag_len_dict):
        for v in recs.itervalues():
            if len(v) > 0:
                o = merge_fusion_exons(v, max_fusion_point_dist=100,
                                       max_exon_end_dist=0,
                                       allow_extra_5_exons=allow_extra_5_exons)
                for group in o:
                    merged_exons.append(group)
                    for r in group:
                        compressed_records_pointer_dict[r.qID].append(merged_i)
                    merged_i += 1

    # step (3). use BranchSimple to write a temporary group file
    # (the per-part GFF writing via process_records is currently disabled)
    # f_good = open(output_prefix + '.gff', 'w')
    f_group = open('branch_tmp.group.txt', 'w')
    # f_bad = f_good
    gene_index = 1
    already_seen = set()
    for qid, indices in compressed_records_pointer_dict.iteritems():
        combo = tuple(indices)
        if combo in already_seen:
            print >> sys.stderr, "combo seen:", combo
            continue
        already_seen.add(combo)
        for isoform_index, i in enumerate(indices):
            bs.cuff_index = gene_index  # all parts of a fusion share one gene index
            records = merged_exons[i]
            f_group.write("{p}.{i}.{j}\t{ids}\n".format(
                p="PBfusion", i=gene_index, j=isoform_index,
                ids=",".join(r.qID for r in records)))
            # bs.process_records(records, allow_extra_5_exons, skip_5_exon_alt,
            #                    f_good, f_bad, f_group, tolerate_end=100,
            #                    starting_isoform_index=isoform_index,
            #                    gene_prefix='PBfusion')
        gene_index += 1
    # f_good.close()
    f_group.close()

    # step (4). read the tmp file and modify to display per fusion gene
    f_group = open(output_prefix + '.group.txt', 'w')
    group_info = {}  # ex: PBfusion.1 --> [id1, id2, id3...]
    count = 0
    with open('branch_tmp.group.txt') as f:
        while True:
            line = f.readline().strip()
            if len(line) == 0:
                break
            # each fusion gene was written as two consecutive lines
            # (PBfusion.<n>.0 and PBfusion.<n>.1); keep only the read IDs
            # that support both halves
            pbid1, groups1 = line.split('\t')
            pbid2, groups2 = f.readline().strip().split('\t')
            assert pbid1.split('.')[1] == pbid2.split('.')[1]
            group = set(groups1.split(',')).intersection(groups2.split(','))
            f_group.write("{0}\t{1}\n".format(
                pbid1[:pbid1.rfind('.')], ",".join(group)))
            group_info[pbid1[:pbid1.rfind('.')]] = list(group)
            count += 1
    f_group.close()
    #os.remove('branch_tmp.group.txt')

    gff_filename = output_prefix + '.gff'
    group_filename = output_prefix + '.group.txt'
    if is_fq:
        output_filename = output_prefix + '.rep.fq'
    else:
        output_filename = output_prefix + '.rep.fa'
    pick_rep(fa_or_fq_filename, sam_filename, gff_filename, group_filename,
             output_filename, is_fq=is_fq, pick_least_err_instead=False)

    print >> sys.stderr, "{0} fusion candidates identified.".format(count)
    print >> sys.stderr, "Output written to: {0}.gff, {0}.group.txt, {1}".format(
        output_prefix, output_filename)

    # (optional) step (5). get count information
    if prefix_dict_pickle_filename is not None:
        with open(prefix_dict_pickle_filename, 'rb') as f:
            d = load(f)
        d1 = d['HQ']
        d1.update(d['LQ'])
        tofu_wrap.get_abundance(output_prefix, d1, output_prefix)
        print >> sys.stderr, "Count information written to: {0}.abundance.txt".format(
            output_prefix)
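
# To make step (4) concrete: a tiny self-contained demo of the pairing logic
# above, using made-up group lines. The two halves of a fusion gene each carry
# a comma-separated list of read IDs; the final PBfusion.1 entry keeps only
# the reads supporting *both* halves.
def _demo_fusion_group_pairing():
    lines = ["PBfusion.1.0\treadA,readB,readC",
             "PBfusion.1.1\treadB,readC,readD"]
    pbid1, groups1 = lines[0].split('\t')
    pbid2, groups2 = lines[1].split('\t')
    assert pbid1.split('.')[1] == pbid2.split('.')[1]  # same gene index
    group = set(groups1.split(',')).intersection(groups2.split(','))
    # prints: PBfusion.1    readB,readC
    print "{0}\t{1}".format(pbid1[:pbid1.rfind('.')], ",".join(sorted(group)))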
def main(args):
    if not os.path.exists(args.input):
        print >> sys.stderr, "Input file {0} does not exist. Abort.".format(args.input)
        sys.exit(-1)
    if not os.path.exists(args.sam):
        print >> sys.stderr, "SAM file {0} does not exist. Abort.".format(args.sam)
        sys.exit(-1)

    # check for duplicate IDs
    check_ids_unique(args.input, is_fq=args.fq)

    ignored_fout = open(args.prefix + '.ignored_ids.txt', 'w')

    if args.flnc_coverage > 0:
        f_good = open(args.prefix + '.collapsed.good.gff', 'w')
        f_bad = open(args.prefix + '.collapsed.bad.gff', 'w')
        cov_threshold = args.flnc_coverage
    else:
        f_good = open(args.prefix + '.collapsed.gff', 'w')
        f_bad = f_good
        cov_threshold = 1
    f_txt = open(args.prefix + '.collapsed.group.txt', 'w')

    b = branch_simple2.BranchSimple(args.input, cov_threshold=cov_threshold,
                                    min_aln_coverage=args.min_aln_coverage,
                                    min_aln_identity=args.min_aln_identity,
                                    is_fq=args.fq)
    sam_iter = b.iter_gmap_sam(args.sam, ignored_fout)
    for recs in sam_iter:
        for v in recs.itervalues():
            if len(v) > 0:
                b.process_records(v, args.allow_extra_5exon, False,
                                  f_good, f_bad, f_txt)

    ignored_fout.close()
    f_good.close()
    if f_bad is not f_good:
        f_bad.close()
    f_txt.close()

    if args.max_fuzzy_junction > 0:
        # need to further collapse those that have fuzzy junctions!
        collapse_fuzzy_junctions(f_good.name, f_txt.name, args.allow_extra_5exon,
                                 internal_fuzzy_max_dist=args.max_fuzzy_junction)
        os.rename(f_good.name, f_good.name + '.unfuzzy')
        os.rename(f_txt.name, f_txt.name + '.unfuzzy')
        os.rename(f_good.name + '.fuzzy', f_good.name)
        os.rename(f_txt.name + '.fuzzy', f_txt.name)

    if args.fq:
        outfile = args.prefix + ".collapsed.rep.fq"
    else:
        outfile = args.prefix + ".collapsed.rep.fa"

    if args.allow_extra_5exon:  # 5' merge allowed, pick the longest representative
        pick_rep(args.input, f_good.name, f_txt.name, outfile,
                 is_fq=args.fq, pick_least_err_instead=False,
                 bad_gff_filename=f_bad.name)
    else:  # otherwise pick the representative with the fewest errors
        pick_rep(args.input, f_good.name, f_txt.name, outfile,
                 is_fq=args.fq, pick_least_err_instead=True,
                 bad_gff_filename=f_bad.name)

    print >> sys.stderr, "Ignored IDs written to:", ignored_fout.name
    print >> sys.stderr, "Output written to:"
    print >> sys.stderr, f_good.name
    print >> sys.stderr, f_txt.name
    print >> sys.stderr, outfile
    print >> sys.stderr, args
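
# A sketch of the command-line wiring that main() above implies: every flag
# corresponds to an attribute the function reads (args.input, args.sam,
# args.prefix, args.fq, args.flnc_coverage, args.max_fuzzy_junction,
# args.min_aln_coverage, args.min_aln_identity, args.allow_extra_5exon).
# The flag names, help strings, and defaults here are assumptions for
# illustration, not necessarily the package's published CLI.
if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser(description="Collapse redundant isoforms mapped to a genome")
    parser.add_argument("--input", required=True, help="input FASTA/FASTQ filename")
    parser.add_argument("--fq", action="store_true", default=False,
                        help="input is FASTQ rather than FASTA")
    parser.add_argument("-s", "--sam", required=True, help="sorted GMAP SAM filename")
    parser.add_argument("-o", "--prefix", required=True, help="output filename prefix")
    parser.add_argument("-c", "--min_aln_coverage", type=float, default=0.99,
                        help="minimum alignment coverage (assumed default: 0.99)")
    parser.add_argument("-i", "--min_aln_identity", type=float, default=0.95,
                        help="minimum alignment identity (assumed default: 0.95)")
    parser.add_argument("--max_fuzzy_junction", type=int, default=5,
                        help="max fuzzy junction distance in bp (assumed default: 5)")
    parser.add_argument("--flnc_coverage", type=int, default=-1,
                        help="min FLNC coverage; <= 0 disables good/bad GFF splitting")
    parser.add_argument("--allow_extra_5exon", action="store_true", default=False,
                        help="allow merging isoforms that differ only by extra 5' exons")
    main(parser.parse_args())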