def main(args):
    ### sanity check that the input file and input SAM exist
    if not os.path.exists(args.input):
        # bug fix: the message previously formatted args.fasta, which does not exist on args
        print >> sys.stderr, "Input file {0} does not exist. Abort.".format(args.input)
        sys.exit(-1)
    if not os.path.exists(args.sam):
        print >> sys.stderr, "SAM file {0} does not exist. Abort.".format(args.sam)
        sys.exit(-1)

    # check for duplicate IDs
    check_ids_unique(args.input, is_fq=args.fq)

    ignored_fout = open(args.prefix + '.ignored_ids.txt', 'w')

    if args.flnc_coverage > 0:
        f_good = open(args.prefix + '.collapsed.good.gff', 'w')
        f_bad = open(args.prefix + '.collapsed.bad.gff', 'w')
        cov_threshold = args.flnc_coverage
    else:
        f_good = open(args.prefix + '.collapsed.gff', 'w')
        f_bad = f_good
        cov_threshold = 1
    f_txt = open(args.prefix + '.collapsed.group.txt', 'w')

    b = branch_simple2.BranchSimple(args.input, cov_threshold=cov_threshold,
                                    min_aln_coverage=args.min_aln_coverage,
                                    min_aln_identity=args.min_aln_identity,
                                    is_fq=args.fq)
    rec_iter = b.iter_gmap_sam(args.sam, ignored_fout)
    # recs is {'+': list of list of records, '-': list of list of records}
    for recs in rec_iter:
        for v in recs.itervalues():
            for v2 in v:
                if len(v2) > 0:
                    b.process_records(v2, args.allow_extra_5exon, False,
                                      f_good, f_bad, f_txt)

    ignored_fout.close()
    f_good.close()
    try:
        f_bad.close()  # f_bad may alias f_good, in which case it is already closed
    except Exception:
        pass
    f_txt.close()

    if args.max_fuzzy_junction > 0:
        # need to further collapse those that have fuzzy junctions!
        collapse_fuzzy_junctions(f_good.name, f_txt.name, args.allow_extra_5exon,
                                 internal_fuzzy_max_dist=args.max_fuzzy_junction)
        os.rename(f_good.name, f_good.name + '.unfuzzy')
        os.rename(f_txt.name, f_txt.name + '.unfuzzy')
        os.rename(f_good.name + '.fuzzy', f_good.name)
        os.rename(f_txt.name + '.fuzzy', f_txt.name)

    if args.fq:
        outfile = args.prefix + ".collapsed.rep.fq"
    else:
        outfile = args.prefix + ".collapsed.rep.fa"
    if args.allow_extra_5exon:  # 5merge, pick longest
        pick_rep(args.input, f_good.name, f_txt.name, outfile,
                 is_fq=args.fq, pick_least_err_instead=False,
                 bad_gff_filename=f_bad.name)
    else:
        pick_rep(args.input, f_good.name, f_txt.name, outfile,
                 is_fq=args.fq, pick_least_err_instead=True,
                 bad_gff_filename=f_bad.name)

    print >> sys.stderr, "Ignored IDs written to:", ignored_fout.name
    print >> sys.stderr, "Output written to:"
    print >> sys.stderr, f_good.name
    print >> sys.stderr, f_txt.name
    print >> sys.stderr, outfile
    print >> sys.stderr, args
def fusion_main(fa_or_fq_filename, sam_filename, output_prefix,
                cluster_report_csv=None, is_fq=False,
                allow_extra_5_exons=True, skip_5_exon_alt=True,
                min_locus_coverage=.05, min_total_coverage=.99,
                min_locus_coverage_bp=1, min_dist_between_loci=10000,
                min_identity=0.95, is_flnc=False):
    """
    (1) identify fusion candidates (based on mapping, total coverage, identity, etc.)
    (2) group/merge the fusion exons, using an index to point to each individual part
    (3) use BranchSimple to write out a tmp GFF where
        PBfusion.1.1 is the first part of a fusion gene
        PBfusion.1.2 is the second part of a fusion gene
    (4) read the tmp file from (3) and modify it so that PBfusion.1 represents
        the whole fusion gene (a single-transcript GFF format)
    """
    compressed_records_pointer_dict = defaultdict(list)
    merged_exons = []
    merged_i = 0

    # step (0). check for duplicate IDs
    check_ids_unique(fa_or_fq_filename, is_fq=is_fq)

    # step (1). identify fusion candidates
    bs = branch_simple2.BranchSimple(fa_or_fq_filename, is_fq=is_fq)
    fusion_candidates = find_fusion_candidates(sam_filename, bs.transfrag_len_dict,
                                               min_locus_coverage,
                                               min_locus_coverage_bp,
                                               min_total_coverage,
                                               min_dist_between_loci,
                                               min_identity=min_identity)

    # step (2). merge the fusion exons
    for recs in iter_gmap_sam_for_fusion(sam_filename, fusion_candidates,
                                         bs.transfrag_len_dict):
        for v in recs.values():
            if len(v) > 0:
                o = merge_fusion_exons(v, max_fusion_point_dist=100,
                                       max_exon_end_dist=0,
                                       allow_extra_5_exons=allow_extra_5_exons)
                for group in o:
                    merged_exons.append(group)
                    for r in group:
                        compressed_records_pointer_dict[r.qID].append(merged_i)
                    merged_i += 1

    # step (3). use BranchSimple to write a temporary file
    f_group = open('branch_tmp.group.txt', 'w')
    gene_index = 1
    already_seen = set()
    for qid, indices in compressed_records_pointer_dict.items():
        combo = tuple(indices)
        if combo in already_seen:
            continue
        already_seen.add(combo)
        # write the fusion parts in transcription order
        for i in indices:
            bs.cuff_index = gene_index  # all parts of this fusion share the same gene index
            records = merged_exons[i]
            isoform_index = get_isoform_index(fusion_candidates[qid],
                                              records[0].sID,
                                              records[0].sStart,
                                              records[0].sEnd)
            f_group.write("{p}.{i}.{j}\t{ids}\n".format(
                p="PBfusion", i=gene_index, j=isoform_index + 1,
                ids=",".join(r.qID for r in records)))
        gene_index += 1
    f_group.close()

    # step (4). read the tmp file and rewrite it per fusion gene
    # IMPORTANT: sometimes a fusion can involve more than 2 loci!
    f_group = open(output_prefix + '.group.txt', 'w')
    group_info = {}  # ex: PBfusion.1 --> [id1, id2, id3...]
    count = 0
    with open('branch_tmp.group.txt') as f:
        while True:
            line = f.readline().strip()
            if len(line) == 0:
                break
            pbid1, groups1 = line.split('\t')
            group = set(groups1.split(','))
            # absorb all subsequent parts that belong to the same fusion gene
            while True:
                cur_pos = f.tell()
                line = f.readline().strip()
                if len(line) == 0:
                    break
                new_pbid, new_group = line.split('\t')
                if new_pbid.split('.')[1] != pbid1.split('.')[1]:
                    # next fusion gene reached; rewind and start over
                    f.seek(cur_pos)
                    break
                else:
                    # still in the same fusion group
                    group = group.intersection(new_group.split(','))
            f_group.write("{0}\t{1}\n".format(pbid1[:pbid1.rfind('.')],
                                              ",".join(group)))
            group_info[pbid1[:pbid1.rfind('.')]] = list(group)
            count += 1
    f_group.close()
    #os.remove('branch_tmp.group.txt')

    gff_filename = output_prefix + '.gff'
    group_filename = output_prefix + '.group.txt'
    if is_fq:
        output_filename = output_prefix + '.rep.fq'
    else:
        output_filename = output_prefix + '.rep.fa'
    pick_rep(fa_or_fq_filename, sam_filename, gff_filename, group_filename,
             output_filename, fusion_candidates, is_fq=is_fq)

    print("{0} fusion candidates identified.".format(count), file=sys.stdout)
    print("Output written to: {0}.gff, {0}.group.txt, {1}".format(
        output_prefix, output_filename), file=sys.stdout)

    # (optional) step (5). get count information
    if cluster_report_csv is not None:
        get_abundance_post_collapse(output_prefix, cluster_report_csv, output_prefix)
        print("Count information written to: {0}.abundance.txt".format(output_prefix),
              file=sys.stdout)
    elif is_flnc:
        print("Input is FLNC. Outputting FLNC counts per fusion.")
        with open(output_prefix + '.abundance.txt', 'w') as f:
            f.write("pbid\tcount_fl\n")
            for pbid, members in group_info.items():
                f.write("{0}\t{1}\n".format(pbid, len(members)))
        print("Count information written to: {0}.abundance.txt".format(output_prefix),
              file=sys.stdout)
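# The seek/tell loop in step (4) above can be hard to follow. The sketch below
# is a hypothetical equivalent (merge_tmp_groups is not part of this module):
# it groups consecutive tmp-file lines by their middle gene index with
# itertools.groupby and intersects the member sets, assuming lines of the form
# "PBfusion.1.1\tid1,id2" (id1/id2 are example sequence IDs).
from itertools import groupby

def merge_tmp_groups(tmp_group_filename):
    """Return {'PBfusion.<gene>': sorted qIDs common to all parts of that fusion}."""
    parts = []
    with open(tmp_group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            parts.append((pbid.split('.')[1], set(members.split(','))))
    merged = {}
    for gene, rows in groupby(parts, key=lambda x: x[0]):
        common = None
        for _, members in rows:
            common = members if common is None else common & members
        merged["PBfusion.{0}".format(gene)] = sorted(common)
    return merged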
def main(args):
    ### sanity check that the input file and input SAM exist
    if not os.path.exists(args.input):
        print("Input file {0} does not exist. Abort.".format(args.input), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(args.sam):
        print("SAM file {0} does not exist. Abort.".format(args.sam), file=sys.stderr)
        sys.exit(-1)

    # check for duplicate IDs
    check_ids_unique(args.input, is_fq=args.fq)

    ignored_fout = open(args.prefix + '.ignored_ids.txt', 'w')

    if args.flnc_coverage > 0:
        f_good = open(args.prefix + '.collapsed.good.gff', 'w')
        f_bad = open(args.prefix + '.collapsed.bad.gff', 'w')
        cov_threshold = args.flnc_coverage
    else:
        f_good = open(args.prefix + '.collapsed.gff', 'w')
        f_bad = f_good
        cov_threshold = 1
    f_txt = open(args.prefix + '.collapsed.group.txt', 'w')

    b = branch_simple2.BranchSimple(args.input, cov_threshold=cov_threshold,
                                    min_aln_coverage=args.min_aln_coverage,
                                    min_aln_identity=args.min_aln_identity,
                                    is_fq=args.fq, max_5_diff=args.max_5_diff,
                                    max_3_diff=args.max_3_diff)
    rec_iter = b.iter_gmap_sam(args.sam, ignored_fout)
    # recs is {'+': list of list of records, '-': list of list of records}
    for recs in rec_iter:
        for v in recs.values():
            for v2 in v:
                if len(v2) > 0:
                    b.process_records(v2, args.allow_extra_5exon, False,
                                      f_good, f_bad, f_txt)

    ignored_fout.close()
    f_good.close()
    try:
        f_bad.close()  # f_bad may alias f_good, in which case it is already closed
    except Exception:
        pass
    f_txt.close()

    if args.max_fuzzy_junction > 0:
        # need to further collapse those that have fuzzy junctions!
        collapse_fuzzy_junctions(f_good.name, f_txt.name, args.allow_extra_5exon,
                                 internal_fuzzy_max_dist=args.max_fuzzy_junction)
        os.rename(f_good.name, f_good.name + '.unfuzzy')
        os.rename(f_txt.name, f_txt.name + '.unfuzzy')
        os.rename(f_good.name + '.fuzzy', f_good.name)
        os.rename(f_txt.name + '.fuzzy', f_txt.name)

    if args.fq:
        outfile = args.prefix + ".collapsed.rep.fq"
    else:
        outfile = args.prefix + ".collapsed.rep.fa"
    if args.allow_extra_5exon:  # 5merge, pick longest
        pick_rep(args.input, f_good.name, f_txt.name, outfile,
                 is_fq=args.fq, pick_least_err_instead=False,
                 bad_gff_filename=f_bad.name)
    else:
        pick_rep(args.input, f_good.name, f_txt.name, outfile,
                 is_fq=args.fq, pick_least_err_instead=True,
                 bad_gff_filename=f_bad.name)

    if args.gen_mol_count:
        outfile = args.prefix + '.collapsed.abundance.txt'
        with open(outfile, 'w') as f:
            f.write("pbid\tcount_fl\n")
            for line in open(f_txt.name):
                pbid, members = line.strip().split()
                f.write("{0}\t{1}\n".format(pbid, members.count(',') + 1))

    print("Ignored IDs written to: {0}".format(ignored_fout.name), file=sys.stdout)
    print("Output written to: {0}\n{1}\n{2}\n{3}\n".format(
        f_good.name, f_txt.name, outfile, args), file=sys.stdout)
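# A minimal argparse wiring sketch for main(args) above. The option strings and
# defaults are assumptions inferred from the attributes main() reads off args
# (and from the Typer version below); the real script may define them differently.
import argparse

def get_parser():
    parser = argparse.ArgumentParser(description="Collapse redundant isoforms from a sorted SAM")
    parser.add_argument("--input", required=True, help="Input FA/FQ filename")
    parser.add_argument("-s", "--sam", required=True, help="Sorted SAM filename")
    parser.add_argument("--fq", action="store_true", default=False, help="Input is a fastq file")
    parser.add_argument("-p", "--prefix", required=True, help="Output filename prefix")
    parser.add_argument("-c", "--min-coverage", dest="min_aln_coverage", type=float,
                        default=0.99, help="Minimum alignment coverage")
    parser.add_argument("-i", "--min-identity", dest="min_aln_identity", type=float,
                        default=0.95, help="Minimum alignment identity")
    parser.add_argument("--max_fuzzy_junction", type=int, default=5)
    parser.add_argument("--max_5_diff", type=int, default=1000)
    parser.add_argument("--max_3_diff", type=int, default=100)
    parser.add_argument("--flnc_coverage", type=int, default=-1)
    parser.add_argument("--gen_mol_count", action="store_true", default=False)
    parser.add_argument("--dun-merge-5-shorter", dest="allow_extra_5exon",
                        action="store_false", default=True)
    return parser

# hypothetical entry point:
# main(get_parser().parse_args())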
def main(
    input_filename: str = typer.Option(..., "--input", help="Input FA/FQ filename"),
    sam: str = typer.Option(..., help="Sorted SAM filename"),
    fq: bool = typer.Option(False, "--fq", help="Input is a fastq file"),  # store_true
    prefix: str = typer.Option(..., "-p", "--prefix", help="Output filename prefix"),
    min_aln_coverage: float = typer.Option(0.99, "--min-coverage", "-c",
                                           help="Minimum alignment coverage"),
    min_aln_identity: float = typer.Option(0.95, "--min-identity", "-i",
                                           help="Minimum alignment identity"),
    max_fuzzy_junction: int = typer.Option(5, help="Max fuzzy junction dist"),
    max_5_diff: int = typer.Option(
        1000, help="Maximum allowed 5' difference if on same exon"),
    max_3_diff: int = typer.Option(
        100, help="Maximum allowed 3' difference if on same exon"),
    flnc_coverage: int = typer.Option(
        -1,
        help="Minimum # of FLNC reads; only use this for aligned FLNC reads, otherwise results are undefined!",
    ),
    gen_mol_count: bool = typer.Option(
        False,
        help="Generate a .abundance.txt file based on the number of input sequences collapsed. Use only if input is FLNC or UMI-dedup output",
    ),  # store_true
    allow_extra_5exon: bool = typer.Option(
        True,
        "--dun-merge-5-shorter",
        help="Don't collapse shorter 5' transcripts (default: turned off)",
    ),  # store_false
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    # sanity check that the input file and input SAM exist
    if not Path(str(input_filename)).exists():
        raise FileNotFoundError(f"Input file {input_filename} does not exist. Abort.")
    if not Path(sam).exists():
        raise FileNotFoundError(f"SAM file {sam} does not exist. Abort.")

    # check for duplicate IDs
    check_ids_unique(input_filename, is_fq=fq)

    with open(f"{prefix}.ignored_ids.txt", "w") as ignored_fout:
        if flnc_coverage > 0:
            # keep these files closed *until* we need to write to them
            f_good = Path(f"{prefix}.collapsed.good.gff")
            f_bad = Path(f"{prefix}.collapsed.bad.gff")
            cov_threshold = flnc_coverage
        else:
            f_good = Path(f"{prefix}.collapsed.gff")
            f_bad = f_good
            cov_threshold = 1
        f_txt = Path(f"{prefix}.collapsed.group.txt")

        b = branch_simple2.BranchSimple(
            transfrag_filename=input_filename,
            cov_threshold=cov_threshold,
            min_aln_coverage=min_aln_coverage,
            min_aln_identity=min_aln_identity,
            is_fq=fq,
            max_5_diff=max_5_diff,
            max_3_diff=max_3_diff,
        )
        rec_iter = b.iter_gmap_sam(sam, ignored_fout)
        # recs is {'+': list of list of records, '-': list of list of records}
        for recs in rec_iter:
            for v in recs.values():
                for v2 in v:
                    if len(v2) > 0:
                        b.process_records(
                            records=v2,
                            allow_extra_5_exons=allow_extra_5exon,
                            skip_5_exon_alt=False,
                            f_good=f_good,
                            f_bad=f_bad,
                            f_group=f_txt,
                        )

    # need to further collapse those that have fuzzy junctions!
    if max_fuzzy_junction > 0:
        collapse_fuzzy_junctions(
            f_good,
            f_txt,
            allow_extra_5exon,
            internal_fuzzy_max_dist=max_fuzzy_junction,
            max_5_diff=max_5_diff,
            max_3_diff=max_3_diff,
        )
        Path(f_good.name).rename(f"{f_good.name}.unfuzzy")
        Path(f_txt.name).rename(f"{f_txt.name}.unfuzzy")
        Path(f"{f_good.name}.fuzzy").rename(f_good.name)
        Path(f"{f_txt.name}.fuzzy").rename(f_txt.name)

    if fq:
        outfile = f"{prefix}.collapsed.rep.fq"
    else:
        outfile = f"{prefix}.collapsed.rep.fa"
    if allow_extra_5exon:  # 5merge, pick longest
        pick_rep(
            fa_fq_filename=input_filename,
            gff_filename=f_good,
            group_filename=f_txt,
            output_filename=outfile,
            is_fq=fq,
            pick_least_err_instead=False,
            bad_gff_filename=f_bad.name,
        )
    else:
        pick_rep(
            fa_fq_filename=input_filename,
            gff_filename=f_good,
            group_filename=f_txt,
            output_filename=outfile,
            is_fq=fq,
            pick_least_err_instead=True,
            bad_gff_filename=f_bad.name,
        )

    if gen_mol_count:
        outfile = f"{prefix}.collapsed.abundance.txt"
        with open(outfile, "w") as f:
            f.write("pbid\tcount_fl\n")
            for line in open(f_txt.name):
                pbid, members = line.strip().split()
                f.write(f"{pbid}\t{members.count(',') + 1}\n")

    logger.info(f"Ignored IDs written to: {ignored_fout.name}")
    logger.info(f"Output written to: {f_good.name}\n{f_txt.name}\n{outfile}\n")
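# Example invocation of the Typer CLI defined above (the script name and file
# names here are hypothetical; Typer converts the remaining underscore
# parameters to dashed options such as --max-fuzzy-junction):
#
#   python collapse_isoforms_by_sam.py \
#       --input hq_transcripts.fastq --fq \
#       --sam hq_transcripts.sorted.sam \
#       --prefix out/hq_transcripts \
#       --min-coverage 0.99 --min-identity 0.95 \
#       --max-fuzzy-junction 5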
def fusion_main(fa_or_fq_filename, sam_filename, output_prefix,
                cluster_report_csv=None, is_fq=False,
                allow_extra_5_exons=True, skip_5_exon_alt=True,
                prefix_dict_pickle_filename=None,
                min_locus_coverage=.05, min_total_coverage=.99,
                min_locus_coverage_bp=1, min_dist_between_loci=10000):
    """
    (1) identify fusion candidates (based on mapping, total coverage, identity, etc.)
    (2) group/merge the fusion exons, using an index to point to each individual part
    (3) use BranchSimple to write out a tmp GFF where
        PBfusion.1.1 is the first part of a fusion gene
        PBfusion.1.2 is the second part of a fusion gene
    (4) read the tmp file from (3) and modify it so that PBfusion.1 represents
        the whole fusion gene (a single-transcript GFF format)
    """
    compressed_records_pointer_dict = defaultdict(list)
    merged_exons = []
    merged_i = 0

    # step (0). check for duplicate IDs
    check_ids_unique(fa_or_fq_filename, is_fq=is_fq)

    # step (1). identify fusion candidates
    bs = branch_simple2.BranchSimple(fa_or_fq_filename, is_fq=is_fq)
    fusion_candidates = find_fusion_candidates(sam_filename, bs.transfrag_len_dict,
                                               min_locus_coverage,
                                               min_locus_coverage_bp,
                                               min_total_coverage,
                                               min_dist_between_loci)

    # step (2). merge the fusion exons
    for recs in iter_gmap_sam_for_fusion(sam_filename, fusion_candidates,
                                         bs.transfrag_len_dict):
        for v in recs.itervalues():
            if len(v) > 0:
                o = merge_fusion_exons(v, max_fusion_point_dist=100,
                                       max_exon_end_dist=0,
                                       allow_extra_5_exons=allow_extra_5_exons)
                for group in o:
                    merged_exons.append(group)
                    for r in group:
                        compressed_records_pointer_dict[r.qID].append(merged_i)
                    merged_i += 1

    # step (3). use BranchSimple to write a temporary file
    f_group = open('branch_tmp.group.txt', 'w')
    gene_index = 1
    already_seen = set()
    for qid, indices in compressed_records_pointer_dict.iteritems():
        combo = tuple(indices)
        if combo in already_seen:
            print "combo seen:", combo
            continue
        already_seen.add(combo)
        for isoform_index, i in enumerate(indices):
            bs.cuff_index = gene_index  # all parts of this fusion share the same gene index
            records = merged_exons[i]
            f_group.write("{p}.{i}.{j}\t{ids}\n".format(
                p="PBfusion", i=gene_index, j=isoform_index,
                ids=",".join(r.qID for r in records)))
        gene_index += 1
    f_group.close()

    # step (4). read the tmp file and rewrite it per fusion gene
    f_group = open(output_prefix + '.group.txt', 'w')
    group_info = {}  # ex: PBfusion.1 --> [id1, id2, id3...]
    count = 0
    with open('branch_tmp.group.txt') as f:
        while True:
            line = f.readline().strip()
            if len(line) == 0:
                break
            pbid1, groups1 = line.split('\t')
            # NOTE: this legacy version assumes every fusion involves exactly
            # two loci; the newer fusion_main earlier in this file handles
            # fusions spanning more than two
            pbid2, groups2 = f.readline().strip().split('\t')
            assert pbid1.split('.')[1] == pbid2.split('.')[1]
            group = set(groups1.split(',')).intersection(groups2.split(','))
            f_group.write("{0}\t{1}\n".format(pbid1[:pbid1.rfind('.')],
                                              ",".join(group)))
            group_info[pbid1[:pbid1.rfind('.')]] = list(group)
            count += 1
    f_group.close()
    #os.remove('branch_tmp.group.txt')

    gff_filename = output_prefix + '.gff'
    group_filename = output_prefix + '.group.txt'
    if is_fq:
        output_filename = output_prefix + '.rep.fq'
    else:
        output_filename = output_prefix + '.rep.fa'
    pick_rep(fa_or_fq_filename, sam_filename, gff_filename, group_filename,
             output_filename, is_fq=is_fq, pick_least_err_instead=False)

    print >> sys.stderr, "{0} fusion candidates identified.".format(count)
    print >> sys.stderr, "Output written to: {0}.gff, {0}.group.txt, {1}".format(
        output_prefix, output_filename)

    # (optional) step (5). get count information
    if cluster_report_csv is not None:
        get_abundance_post_collapse(output_prefix, cluster_report_csv, output_prefix)
        print >> sys.stderr, "Count information written to: {0}.abundance.txt".format(
            output_prefix)
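# The legacy fusion_main above has no FLNC-count branch. A minimal sketch
# (write_flnc_counts is a hypothetical helper, not part of this module) of
# deriving per-fusion counts from a finished .group.txt, matching the
# "pbid\tcount_fl" format the collapse scripts write:
def write_flnc_counts(group_filename, abundance_filename):
    with open(group_filename) as f_in, open(abundance_filename, 'w') as f_out:
        f_out.write("pbid\tcount_fl\n")
        for line in f_in:
            pbid, members = line.strip().split('\t')
            # one FLNC read per comma-separated member ID
            f_out.write("{0}\t{1}\n".format(pbid, members.count(',') + 1))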