Example #1

import os
import sys

# check_ids_unique, collapse_fuzzy_junctions, pick_rep, and the branch_simple2
# module are assumed to be imported from the surrounding package, as in the
# original script this snippet was taken from.

def main(args):
    if not os.path.exists(args.input):
        print >> sys.stderr, "Input file {0} does not exist. Abort.".format(args.input)
        sys.exit(-1)
    
    if not os.path.exists(args.sam):
        print >> sys.stderr, "SAM file {0} does not exist. Abort.".format(args.sam)
        sys.exit(-1)

    # check for duplicate IDs
    check_ids_unique(args.input, is_fq=args.fq)
    
    ignored_fout = open(args.prefix + '.ignored_ids.txt', 'w')

    if args.flnc_coverage > 0:
        f_good = open(args.prefix + '.collapsed.good.gff', 'w')
        f_bad = open(args.prefix + '.collapsed.bad.gff', 'w')
        cov_threshold = args.flnc_coverage
    else:
        f_good = open(args.prefix + '.collapsed.gff', 'w')
        f_bad = f_good  # no good/bad split: everything goes to the same GFF
        cov_threshold = 1
    f_txt = open(args.prefix + '.collapsed.group.txt', 'w')
    
    b = branch_simple2.BranchSimple(args.input,
                                    cov_threshold=cov_threshold,
                                    min_aln_coverage=args.min_aln_coverage,
                                    min_aln_identity=args.min_aln_identity,
                                    is_fq=args.fq)
    for recs in b.iter_gmap_sam(args.sam, ignored_fout):
        for v in recs.itervalues():
            if len(v) > 0:
                b.process_records(v, args.allow_extra_5exon, False, f_good, f_bad, f_txt)
    
    ignored_fout.close()
    f_good.close()
    if f_bad is not f_good:  # f_bad may alias f_good (see above)
        f_bad.close()
    f_txt.close()

    if args.max_fuzzy_junction > 0:  # further collapse isoforms whose junctions differ by only a few bp
        collapse_fuzzy_junctions(f_good.name, f_txt.name, args.allow_extra_5exon,
                                 internal_fuzzy_max_dist=args.max_fuzzy_junction)
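        # collapse_fuzzy_junctions evidently writes its results to <name>.fuzzy
        # files; keep the pre-collapse outputs as *.unfuzzy and promote the
        # *.fuzzy files to the canonical names that pick_rep reads below.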
        os.rename(f_good.name, f_good.name+'.unfuzzy')
        os.rename(f_txt.name, f_txt.name+'.unfuzzy')
        os.rename(f_good.name+'.fuzzy', f_good.name)
        os.rename(f_txt.name+'.fuzzy', f_txt.name)

    if args.fq:
        outfile = args.prefix+".collapsed.rep.fq"
    else:
        outfile = args.prefix+".collapsed.rep.fa"
    if args.allow_extra_5exon:  # 5' merging allowed: pick the longest sequence as representative
        pick_rep(args.input, f_good.name, f_txt.name, outfile, is_fq=args.fq,
                 pick_least_err_instead=False, bad_gff_filename=f_bad.name)
    else:  # no 5' merging: pick the sequence with the fewest errors instead
        pick_rep(args.input, f_good.name, f_txt.name, outfile, is_fq=args.fq,
                 pick_least_err_instead=True, bad_gff_filename=f_bad.name)
    
    print >> sys.stderr, "Ignored IDs written to:", ignored_fout.name
    print >> sys.stderr, "Output written to:"
    print >> sys.stderr, f_good.name
    print >> sys.stderr, f_txt.name
    print >> sys.stderr, outfile
    print >> sys.stderr, args
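
The args namespace consumed by main above is built elsewhere in the original
script. Below is a minimal argparse sketch consistent with the attributes the
function reads; the option names, defaults, and help strings are assumptions,
not the original CLI.

from argparse import ArgumentParser

parser = ArgumentParser(description="Collapse redundant mapped isoforms (sketch)")
parser.add_argument("--input", required=True, help="input FASTA/FASTQ of isoforms")
parser.add_argument("--sam", required=True, help="sorted SAM alignment of the input")
parser.add_argument("--prefix", required=True, help="output filename prefix")
parser.add_argument("--fq", action="store_true", help="input is FASTQ, not FASTA")
parser.add_argument("--flnc_coverage", type=int, default=-1,
                    help="min FLNC coverage; <= 0 disables the good/bad split")
parser.add_argument("--min_aln_coverage", type=float, default=0.99)
parser.add_argument("--min_aln_identity", type=float, default=0.95)
parser.add_argument("--max_fuzzy_junction", type=int, default=5,
                    help="max bp difference between junctions to still merge")
parser.add_argument("--allow_extra_5exon", action="store_true")

main(parser.parse_args())
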
Example #2

def main(args):
    if not os.path.exists(args.input):
        print >> sys.stderr, "Input file {0} does not exist. Abort.".format(
            args.input)
        sys.exit(-1)

    if not os.path.exists(args.sam):
        print >> sys.stderr, "SAM file {0} does not exist. Abort.".format(
            args.sam)
        sys.exit(-1)

    # check for duplicate IDs
    check_ids_unique(args.input, is_fq=args.fq)

    ignored_fout = open(args.prefix + '.ignored_ids.txt', 'w')
    f_gff = open(args.prefix + '.collapsed.gff', 'w')
    f_txt = open(args.prefix + '.collapsed.group.txt', 'w')

    b = branch_simple2.BranchSimple(args.input,
                                    cov_threshold=1,
                                    min_aln_coverage=args.min_aln_coverage,
                                    min_aln_identity=args.min_aln_identity,
                                    is_fq=args.fq)
    for recs in b.iter_gmap_sam(args.sam, ignored_fout):
        for v in recs.itervalues():
            if len(v) > 0:
                b.process_records(v, args.allow_extra_5exon, False, f_gff,
                                  f_gff, f_txt)

    ignored_fout.close()
    f_gff.close()
    f_txt.close()

    if args.fq:
        outfile = args.prefix + ".collapsed.rep.fq"
    else:
        outfile = args.prefix + ".collapsed.rep.fa"
    pick_rep(args.input, f_gff.name, f_txt.name, outfile, is_fq=args.fq)

    print >> sys.stderr, "Ignored IDs written to:", ignored_fout.name
    print >> sys.stderr, "Output written to:"
    print >> sys.stderr, f_gff.name
    print >> sys.stderr, f_txt.name
    print >> sys.stderr, outfile
    print >> sys.stderr, args
Example #3

import sys
from collections import defaultdict
from pickle import load

# branch_simple2, check_ids_unique, find_fusion_candidates,
# iter_gmap_sam_for_fusion, merge_fusion_exons, pick_rep, and tofu_wrap are
# assumed to be imported from the surrounding package, as in the original
# script this snippet was taken from.
def fusion_main(fa_or_fq_filename,
                sam_filename,
                output_prefix,
                is_fq=False,
                allow_extra_5_exons=True,
                skip_5_exon_alt=True,
                prefix_dict_pickle_filename=None,
                min_locus_coverage=.05,
                min_total_coverage=.99,
                min_locus_coverage_bp=1,
                min_dist_between_loci=10000):
    """
    (1) identify fusion candidates (based on mapping, total coverage, identity, etc)
    (2) group/merge the fusion exons, using an index to point to each individual part
    (3) use BranchSimple to write out a tmp GFF where 
         PBfusion.1.1 is the first part of a fusion gene
         PBfusion.1.2 is the second part of a fusion gene
    (4) read the tmp file from <3> and modify it so that 
         PBfusion.1 just represents the fusion gene (a single transcript GFF format)
    """
    compressed_records_pointer_dict = defaultdict(list)  # qID --> list of indices into merged_exons
    merged_exons = []
    merged_i = 0

    # step (0). check for duplicate IDs
    check_ids_unique(fa_or_fq_filename, is_fq=is_fq)

    # step (1). identify fusion candidates
    bs = branch_simple2.BranchSimple(fa_or_fq_filename, is_fq=is_fq)
    fusion_candidates = find_fusion_candidates(
        sam_filename, bs.transfrag_len_dict, min_locus_coverage,
        min_locus_coverage_bp, min_total_coverage, min_dist_between_loci)

    # step (2). merge the fusion exons
    for recs in iter_gmap_sam_for_fusion(sam_filename, fusion_candidates,
                                         bs.transfrag_len_dict):
        for v in recs.itervalues():
            if len(v) > 0:
                o = merge_fusion_exons(v,
                                       max_fusion_point_dist=100,
                                       max_exon_end_dist=0,
                                       allow_extra_5_exons=allow_extra_5_exons)
                for group in o:
                    merged_exons.append(group)
                    for r in group:
                        compressed_records_pointer_dict[r.qID].append(merged_i)
                    merged_i += 1

    # step (3). use BranchSimple to write a temporary file
#    f_good = open(output_prefix + '.gff', 'w')
    f_group = open('branch_tmp.group.txt', 'w')
    #    f_bad = f_good
    gene_index = 1
    already_seen = set()
    for qid, indices in compressed_records_pointer_dict.iteritems():
        combo = tuple(indices)
        if combo in already_seen:
            print "combo seen:", combo
            #raw_input("")
            continue
        already_seen.add(combo)
        #        if gene_index == 7:
        #            pdb.set_trace()
        for isoform_index, i in enumerate(indices):
            bs.cuff_index = gene_index  # keep every part of this fusion under the same gene index
            records = merged_exons[i]
            f_group.write("{p}.{i}.{j}\t{ids}\n".format(
                p="PBfusion",
                i=gene_index,
                j=isoform_index,
                ids=",".join(r.qID for r in records)))
#            bs.process_records(records, allow_extra_5_exons, skip_5_exon_alt, \
#                    f_good, f_bad, f_group, tolerate_end=100, \
#                    starting_isoform_index=isoform_index, gene_prefix='PBfusion')
        gene_index += 1


#    f_good.close()
#    f_bad.close()
    f_group.close()

    # step (4). read the tmp file and modify to display per fusion gene
    f_group = open(output_prefix + '.group.txt', 'w')
    group_info = {}  # ex: PBfusion.1 --> [id1, id2, id3...]
    count = 0
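    # NOTE: this loop assumes each fusion gene has exactly two parts, i.e. two
    # consecutive PBfusion.<gene>.<part> lines in the tmp group file.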
    with open('branch_tmp.group.txt') as f:
        while True:
            line = f.readline().strip()
            if len(line) == 0: break
            pbid1, groups1 = line.split('\t')
            pbid2, groups2 = f.readline().strip().split('\t')
            assert pbid1.split('.')[1] == pbid2.split('.')[1]
            group = set(groups1.split(',')).intersection(groups2.split(','))
            f_group.write("{0}\t{1}\n".format(pbid1[:pbid1.rfind('.')],
                                              ",".join(group)))
            group_info[pbid1[:pbid1.rfind('.')]] = list(group)
            count += 1
    f_group.close()
    #os.remove('branch_tmp.group.txt')

    gff_filename = output_prefix + '.gff'
    group_filename = output_prefix + '.group.txt'
    if is_fq:
        output_filename = output_prefix + '.rep.fq'
    else:
        output_filename = output_prefix + '.rep.fa'
    pick_rep(fa_or_fq_filename,
             sam_filename,
             gff_filename,
             group_filename,
             output_filename,
             is_fq=is_fq,
             pick_least_err_instead=False)

    print >> sys.stderr, "{0} fusion candidates identified.".format(count)
    print >> sys.stderr, "Output written to: {0}.gff, {0}.group.txt, {1}".format(
        output_prefix, output_filename)

    # (optional) step 5. get count information
    if prefix_dict_pickle_filename is not None:
        with open(prefix_dict_pickle_filename) as f:
            d = load(f)
            d1 = d['HQ']
            d1.update(d['LQ'])
        tofu_wrap.get_abundance(output_prefix, d1, output_prefix)
        print >> sys.stderr, "Count information written to: {0}.abundance.txt".format(
            output_prefix)
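
A minimal invocation sketch; the filenames below are hypothetical, and the SAM
file is presumably a sorted GMAP alignment of the input reads (per
iter_gmap_sam_for_fusion above).

fusion_main('hq_isoforms.fastq',             # hypothetical input reads
            'hq_isoforms.fastq.sorted.sam',  # hypothetical alignment of those reads
            'out.fusion',                    # output prefix
            is_fq=True)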